Diffstat (limited to 'src/gallium/drivers')
396 files changed, 95599 insertions, 0 deletions
diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile
new file mode 100644
index 0000000000..6161cb6ff8
--- /dev/null
+++ b/src/gallium/drivers/Makefile
@@ -0,0 +1,20 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+SUBDIRS = $(GALLIUM_DRIVER_DIRS)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/drivers/cell/Makefile b/src/gallium/drivers/cell/Makefile
new file mode 100644
index 0000000000..47aef7b05f
--- /dev/null
+++ b/src/gallium/drivers/cell/Makefile
@@ -0,0 +1,12 @@
+# Cell Gallium driver Makefile
+
+
+default:
+	( cd spu ; make )
+	( cd ppu ; make )
+
+
+
+clean:
+	( cd spu ; make clean )
+	( cd ppu ; make clean )
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
new file mode 100644
index 0000000000..1f6860da11
--- /dev/null
+++ b/src/gallium/drivers/cell/common.h
@@ -0,0 +1,376 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Types and tokens which are common to the SPU and PPU code.
+ */
+
+
+#ifndef CELL_COMMON_H
+#define CELL_COMMON_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+
+
+/** The standard assert macro doesn't seem to work reliably */
+#define ASSERT(x) \
+   if (!(x)) { \
+      ubyte *p = NULL; \
+      fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \
+              __FILE__, __LINE__, __FUNCTION__, #x);             \
+      *p = 0; \
+      exit(1); \
+   }
+
+
+
+#define JOIN(x, y) JOIN_AGAIN(x, y)
+#define JOIN_AGAIN(x, y) x ## y
+
+#define STATIC_ASSERT(e) \
+{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 
1 : -1];} + + + +/** for sanity checking */ +#define ASSERT_ALIGN16(ptr) \ +  ASSERT((((unsigned long) (ptr)) & 0xf) == 0); + + +/** round up value to next multiple of 4 */ +#define ROUNDUP4(k)  (((k) + 0x3) & ~0x3) + +/** round up value to next multiple of 8 */ +#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7) + +/** round up value to next multiple of 16 */ +#define ROUNDUP16(k)  (((k) + 0xf) & ~0xf) + + +#define CELL_MAX_SPUS 8 + +#define CELL_MAX_SAMPLERS 4 +#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */ +#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */ +#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */ +#define CELL_MAX_HEIGHT 1024   /**< max framebuffer width */ + +#define TILE_SIZE 32 + + +/** + * The low byte of a mailbox word contains the command opcode. + * Remaining higher bytes are command specific. + */ +#define CELL_CMD_OPCODE_MASK 0xff + +#define CELL_CMD_EXIT                 1 +#define CELL_CMD_CLEAR_SURFACE        2 +#define CELL_CMD_FINISH               3 +#define CELL_CMD_RENDER               4 +#define CELL_CMD_BATCH                5 +#define CELL_CMD_RELEASE_VERTS        6 +#define CELL_CMD_STATE_FRAMEBUFFER   10 +#define CELL_CMD_STATE_FRAGMENT_OPS  11 +#define CELL_CMD_STATE_SAMPLER       12 +#define CELL_CMD_STATE_TEXTURE       13 +#define CELL_CMD_STATE_VERTEX_INFO   14 +#define CELL_CMD_STATE_VIEWPORT      15 +#define CELL_CMD_STATE_UNIFORMS      16 +#define CELL_CMD_STATE_VS_ARRAY_INFO 17 +#define CELL_CMD_STATE_BIND_VS       18 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 +#define CELL_CMD_STATE_ATTRIB_FETCH  20 +#define CELL_CMD_STATE_FS_CONSTANTS  21 +#define CELL_CMD_STATE_RASTERIZER    22 +#define CELL_CMD_VS_EXECUTE          23 +#define CELL_CMD_FLUSH_BUFFER_RANGE  24 +#define CELL_CMD_FENCE               25 + + +/** Command/batch buffers */ +#define CELL_NUM_BUFFERS 4 +#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */ + +#define CELL_BUFFER_STATUS_FREE 10 +#define CELL_BUFFER_STATUS_USED 20 + +/** Debug flags */ +#define CELL_DEBUG_CHECKER              (1 << 0) +#define CELL_DEBUG_ASM                  (1 << 1) +#define CELL_DEBUG_SYNC                 (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) +#define CELL_DEBUG_CMD                  (1 << 5) +#define CELL_DEBUG_CACHE                (1 << 6) + +#define CELL_FENCE_IDLE      0 +#define CELL_FENCE_EMITTED   1 +#define CELL_FENCE_SIGNALLED 2 + +#define CELL_FACING_FRONT    0 +#define CELL_FACING_BACK     1 + +struct cell_fence +{ +   /** There's a 16-byte status qword per SPU */ +   volatile uint status[CELL_MAX_SPUS][4]; +}; + +#ifdef __SPU__ +typedef vector unsigned int opcode_t; +#else +typedef unsigned int opcode_t[4]; +#endif + +/** + * Fence command sent to SPUs.  In response, the SPUs will write + * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory. + */ +struct cell_command_fence +{ +   opcode_t opcode;      /**< CELL_CMD_FENCE */ +   struct cell_fence *fence; +   uint32_t pad_[3]; +}; + + +/** + * Command to specify per-fragment operations state and generated code. + * Note that this is a variant-length structure, allocated with as  + * much memory as needed to hold the generated code; the "code" + * field *must* be the last field in the structure.  
Also, the entire + * length of the structure (including the variant code field) must be + * a multiple of 8 bytes; we require that this structure itself be + * a multiple of 8 bytes, and that the generated code also be a multiple + * of 8 bytes. + * + * Also note that the dsa, blend, blend_color fields are really only needed + * for the fallback/C per-pixel code.  They're not used when we generate + * dynamic SPU fragment code (which is the normal case), and will eventually + * be removed from this structure. + */ +struct cell_command_fragment_ops +{ +   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */ + +   /* Fields for the fallback case */ +   struct pipe_depth_stencil_alpha_state dsa; +   struct pipe_blend_state blend; +   struct pipe_blend_color blend_color; + +   /* Fields for the generated SPU code */ +   unsigned total_code_size; +   unsigned front_code_index; +   unsigned back_code_index; +   /* this field has variant length, and must be the last field in  +    * the structure +    */ +   unsigned code[0]; +}; + + +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512 + +/** + * Command to send a fragment program to SPUs. + */ +struct cell_command_fragment_program +{ +   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ +   uint num_inst;        /**< Number of instructions */ +   uint32_t pad[3]; +   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +}; + + +/** + * Tell SPUs about the framebuffer size, location + */ +struct cell_command_framebuffer +{ +   opcode_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */ +   int width, height; +   void *color_start, *depth_start; +   enum pipe_format color_format, depth_format; +   uint32_t pad_[2]; +}; + + +/** + * Tell SPUs about rasterizer state. + */ +struct cell_command_rasterizer +{ +   opcode_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */ +   struct pipe_rasterizer_state rasterizer; +}; + + +/** + * Clear framebuffer to the given value/color. + */ +struct cell_command_clear_surface +{ +   opcode_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */ +   uint surface; /**< Temporary: 0=color, 1=Z */ +   uint value; +   uint32_t pad[2]; +}; + + +/** + * Array info used by the vertex shader's vertex puller. + */ +struct cell_array_info +{ +   uint64_t base;      /**< Base address of the 0th element. */ +   uint attr;          /**< Attribute that this state is for. */ +   uint pitch;         /**< Byte pitch from one entry to the next. 
*/ +   uint size; +   uint function_offset; +}; + + +struct cell_attribute_fetch_code +{ +   uint64_t base; +   uint size; +}; + + +struct cell_buffer_range +{ +   uint64_t base; +   unsigned size; +}; + + +struct cell_shader_info +{ +   uint64_t declarations; +   uint64_t instructions; +   uint64_t  immediates; + +   unsigned num_outputs; +   unsigned num_declarations; +   unsigned num_instructions; +   unsigned num_immediates; +}; + + +#define SPU_VERTS_PER_BATCH 64 +struct cell_command_vs +{ +   opcode_t opcode;       /**< CELL_CMD_VS_EXECUTE */ +   uint64_t vOut[SPU_VERTS_PER_BATCH]; +   unsigned num_elts; +   unsigned elts[SPU_VERTS_PER_BATCH]; +   float plane[12][4]; +   unsigned nr_planes; +   unsigned nr_attrs; +}; + + +struct cell_command_render +{ +   opcode_t opcode;   /**< CELL_CMD_RENDER */ +   uint prim_type;    /**< PIPE_PRIM_x */ +   uint num_verts; +   uint vertex_size;  /**< bytes per vertex */ +   uint num_indexes; +   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */ +   float xmin, ymin, xmax, ymax;  /* XXX another dummy field */ +   uint min_index; +   boolean inline_verts; +   uint32_t pad_[1]; +}; + + +struct cell_command_release_verts +{ +   opcode_t opcode;         /**< CELL_CMD_RELEASE_VERTS */ +   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */ +   uint32_t pad_[3]; +}; + + +struct cell_command_sampler +{ +   opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */ +   uint unit; +   struct pipe_sampler_state state; +   uint32_t pad_[1]; +}; + + +struct cell_command_texture +{ +   opcode_t opcode;     /**< CELL_CMD_STATE_TEXTURE */ +   uint target;         /**< PIPE_TEXTURE_x */ +   uint unit; +   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */ +   ushort width[CELL_MAX_TEXTURE_LEVELS]; +   ushort height[CELL_MAX_TEXTURE_LEVELS]; +   ushort depth[CELL_MAX_TEXTURE_LEVELS]; +}; + + +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. + */ +struct cell_spu_function_info +{ +   uint num; +   char names[MAX_SPU_FUNCTIONS][16]; +   uint addrs[MAX_SPU_FUNCTIONS]; +   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; + + +/** This is the object passed to spe_create_thread() */ +struct cell_init_info +{ +   unsigned id; +   unsigned num_spus; +   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */ +   float inv_timebase;    /**< 1.0/timebase, for perf measurement */ + +   /** Buffers for command batches, vertex/index data */ +   ubyte *buffers[CELL_NUM_BUFFERS]; +   uint *buffer_status;  /**< points at cell_context->buffer_status */ + +   struct cell_spu_function_info *spu_functions; +} ALIGN16_ATTRIB; + + +#endif /* CELL_COMMON_H */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile new file mode 100644 index 0000000000..c92f8e5cba --- /dev/null +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -0,0 +1,86 @@ +# Gallium3D Cell driver: PPU code + +# This makefile builds the libcell.a library which gets pulled into +# the main libGL.so library + + +TOP = ../../../../.. +include $(TOP)/configs/current + + +# This is the "top-level" cell PPU driver code, will get pulled into libGL.so +# by the winsys Makefile. +CELL_LIB = ../libcell.a + + +# This is the SPU code.  We'd like to be able to put this into the libcell.a +# archive with the PPU code, but nesting .a libs doesn't seem to work. 
+# So, it's pulled into libGL.so in gallium/winsys/xlib/Makefile +SPU_CODE_MODULE = ../spu/g3d_spu.a + + +SOURCES = \ +	cell_batch.c \ +	cell_clear.c \ +	cell_context.c \ +	cell_draw_arrays.c \ +	cell_fence.c \ +	cell_flush.c \ +	cell_gen_fragment.c \ +	cell_gen_fp.c \ +	cell_state_derived.c \ +	cell_state_emit.c \ +	cell_state_shader.c \ +	cell_pipe_state.c \ +	cell_screen.c \ +	cell_state_vertex.c \ +	cell_spu.c \ +	cell_surface.c \ +	cell_texture.c \ +	cell_vbuf.c \ +	cell_vertex_fetch.c \ +	cell_vertex_shader.c + + +OBJECTS = $(SOURCES:.c=.o) \ + +INCLUDE_DIRS = \ +	-I$(TOP)/src/mesa \ +	-I$(TOP)/src/gallium/include \ +	-I$(TOP)/src/gallium/auxiliary \ +	-I$(TOP)/src/gallium/drivers + +.c.o: +	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + + +.c.s: +	$(CC) -S $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + + +default: $(CELL_LIB) + + +$(CELL_LIB): $(OBJECTS) $(SPU_CODE_MODULE) +#	ar -ru $(CELL_LIB) $(OBJECTS) $(SPU_CODE_MODULE) # doesn't work +	ar -ru $(CELL_LIB) $(OBJECTS) + +#$(PROG): $(PPU_OBJECTS) +#	$(CC) -o $(PROG) $(PPU_OBJECTS) $(SPU_CODE_MODULE) $(PPU_LFLAGS) + + + +clean: +	rm -f *.o *~ $(CELL_LIB) + + + +depend: $(SOURCES) +	rm -f depend +	touch depend +	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + + + diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c new file mode 100644 index 0000000000..fe144f8b84 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -0,0 +1,260 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_spu.h" + + + +/** + * Search the buffer pool for an empty/free buffer and return its index. + * Buffers are used for storing vertex data, state and commands which + * will be sent to the SPUs. + * If no empty buffers are available, wait for one. 
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1] + */ +uint +cell_get_empty_buffer(struct cell_context *cell) +{ +   static uint prev_buffer = 0; +   uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS; +   uint tries = 0; + +   /* Find a buffer that's marked as free by all SPUs */ +   while (1) { +      uint spu, num_free = 0; + +      for (spu = 0; spu < cell->num_spus; spu++) { +         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) { +            num_free++; + +            if (num_free == cell->num_spus) { +               /* found a free buffer, now mark status as used */ +               for (spu = 0; spu < cell->num_spus; spu++) { +                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; +               } +               /* +               printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries); +               */ +               prev_buffer = buf; + +               /* release tex buffer associated w/ prev use of this batch buf */ +               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]); + +               return buf; +            } +         } +         else { +            break; +         } +      } + +      /* try next buf */ +      buf = (buf + 1) % CELL_NUM_BUFFERS; + +      tries++; +      if (tries == 100) { +         /* +         printf("PPU WAITING for buffer...\n"); +         */ +      } +   } +} + + +/** + * Append a fence command to the current batch buffer. + * Note that we're sure there's always room for this because of the + * adjusted size check in cell_batch_free_space(). + */ +static void +emit_fence(struct cell_context *cell) +{ +   const uint batch = cell->cur_batch; +   const uint size = cell->buffer_size[batch]; +   struct cell_command_fence *fence_cmd; +   struct cell_fence *fence = &cell->fenced_buffers[batch].fence; +   uint i; + +   /* set fence status to emitted, not yet signalled */ +   for (i = 0; i < cell->num_spus; i++) { +      fence->status[i][0] = CELL_FENCE_EMITTED; +   } + +   STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0); +   ASSERT(size % 16 == 0); +   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); + +   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); +   fence_cmd->opcode[0] = CELL_CMD_FENCE; +   fence_cmd->fence = fence; + +   /* update batch buffer size */ +   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); +} + + +/** + * Flush the current batch buffer to the SPUs. + * An empty buffer will be found and set as the new current batch buffer + * for subsequent commands/data. + */ +void +cell_batch_flush(struct cell_context *cell) +{ +   static boolean flushing = FALSE; +   uint batch = cell->cur_batch; +   uint size = cell->buffer_size[batch]; +   uint spu, cmd_word; + +   assert(!flushing); + +   if (size == 0) +      return; + +   /* Before we use this batch buffer, make sure any fenced texture buffers +    * are released. +    */ +   if (cell->fenced_buffers[batch].head) { +      emit_fence(cell); +      size = cell->buffer_size[batch]; +   } + +   flushing = TRUE; + +   assert(batch < CELL_NUM_BUFFERS); + +   /* +   printf("cell_batch_dispatch: buf %u at %p, size %u\n", +          batch, &cell->buffer[batch][0], size); +   */ +      +   /* +    * Build "BATCH" command and send to all SPUs. 
+    */ +   cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16); + +   for (spu = 0; spu < cell->num_spus; spu++) { +      assert(cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_USED); +      send_mbox_message(cell_global.spe_contexts[spu], cmd_word); +   } + +   /* When the SPUs are done copying the buffer into their locals stores +    * they'll write a BUFFER_STATUS_FREE message into the buffer_status[] +    * array indicating that the PPU can re-use the buffer. +    */ + +   batch = cell_get_empty_buffer(cell); + +   cell->buffer_size[batch] = 0;  /* empty */ +   cell->cur_batch = batch; + +   flushing = FALSE; +} + + +/** + * Return the number of bytes free in the current batch buffer. + */ +uint +cell_batch_free_space(const struct cell_context *cell) +{ +   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; +   free -= sizeof(struct cell_command_fence); +   return free; +} + + +/** + * Allocate space in the current batch buffer for 'bytes' space. + * Bytes must be a multiple of 16 bytes.  Allocation will be 16 byte aligned. + * \return address in batch buffer to put data + */ +void * +cell_batch_alloc16(struct cell_context *cell, uint bytes) +{ +   void *pos; +   uint size; + +   ASSERT(bytes % 16 == 0); +   ASSERT(bytes <= CELL_BUFFER_SIZE); +   ASSERT(cell->cur_batch >= 0); + +#ifdef ASSERT +   { +      uint spu; +      for (spu = 0; spu < cell->num_spus; spu++) { +         ASSERT(cell->buffer_status[spu][cell->cur_batch][0] +                 == CELL_BUFFER_STATUS_USED); +      } +   } +#endif + +   size = cell->buffer_size[cell->cur_batch]; + +   if (bytes > cell_batch_free_space(cell)) { +      cell_batch_flush(cell); +      size = 0; +   } + +   ASSERT(size % 16 == 0); +   ASSERT(size + bytes <= CELL_BUFFER_SIZE); + +   pos = (void *) (cell->buffer[cell->cur_batch] + size); + +   cell->buffer_size[cell->cur_batch] = size + bytes; + +   return pos; +} + + +/** + * One-time init of batch buffers. + */ +void +cell_init_batch_buffers(struct cell_context *cell) +{ +   uint spu, buf; + +   /* init command, vertex/index buffer info */ +   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) { +      cell->buffer_size[buf] = 0; + +      /* init batch buffer status values, +       * mark 0th buffer as used, rest as free. +       */ +      for (spu = 0; spu < cell->num_spus; spu++) { +         if (buf == 0) +            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; +         else +            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE; +      } +   } +} diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h new file mode 100644 index 0000000000..290136031a --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.h @@ -0,0 +1,54 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_BATCH_H +#define CELL_BATCH_H + +#include "pipe/p_compiler.h" + + +struct cell_context; + + +extern uint +cell_get_empty_buffer(struct cell_context *cell); + +extern void +cell_batch_flush(struct cell_context *cell); + +extern uint +cell_batch_free_space(const struct cell_context *cell); + +extern void * +cell_batch_alloc16(struct cell_context *cell, uint bytes); + +extern void +cell_init_batch_buffers(struct cell_context *cell); + + +#endif /* CELL_BATCH_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c new file mode 100644 index 0000000000..c2e276988c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -0,0 +1,123 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/** + * Authors + *  Brian Paul + */ + +#include <stdio.h> +#include <assert.h> +#include <stdint.h> +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "cell/common.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_state.h" + + +/** + * Convert packed pixel from one format to another. + */ +static unsigned +convert_color(enum pipe_format srcFormat, unsigned srcColor, +              enum pipe_format dstFormat) +{ +   ubyte r, g, b, a; +   unsigned dstColor; + +   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a); +   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor); + +   return dstColor; +} + + + +/** + * Called via pipe->clear() + */ +void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, +                   unsigned clearValue) +{ +   struct pipe_screen *screen = pipe->screen; +   struct cell_context *cell = cell_context(pipe); +   uint surfIndex; + +   if (cell->dirty) +      cell_update_derived(cell); + + +   if (!cell->cbuf_map[0]) +      cell->cbuf_map[0] = screen->surface_map(screen, ps, +                                              PIPE_BUFFER_USAGE_GPU_WRITE); + +   if (ps == cell->framebuffer.zsbuf) { +      /* clear z/stencil buffer */ +      surfIndex = 1; +   } +   else { +      /* clear color buffer */ +      surfIndex = 0; + +      if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { +         clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue, +                                    ps->format); +      } +   } + + +   /* Build a CLEAR command and place it in the current batch buffer */ +   { +      STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0); +      struct cell_command_clear_surface *clr +         = (struct cell_command_clear_surface *) +         cell_batch_alloc16(cell, sizeof(*clr)); +      clr->opcode[0] = CELL_CMD_CLEAR_SURFACE; +      clr->surface = surfIndex; +      clr->value = clearValue; +   } + +   /* Technically, the surface's contents are now known and cleared, +    * so we could set the status to PIPE_SURFACE_STATUS_CLEAR.  But +    * it turns out it's quite painful to recognize when any particular +    * surface goes from PIPE_SURFACE_STATUS_CLEAR to  +    * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because +    * the drawing commands could be operating on numerous draw buffers, +    * which we'd have to iterate through to set all their stati... +    * For now, we cheat a bit and set the surface's status to DEFINED +    * right here.  Later we should revisit this and set the status to +    * CLEAR here, and find a better place to set the status to DEFINED. +    */ +   ps->status = PIPE_SURFACE_STATUS_DEFINED; +} diff --git a/src/gallium/drivers/cell/ppu/cell_clear.h b/src/gallium/drivers/cell/ppu/cell_clear.h new file mode 100644 index 0000000000..ff47d43f4c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.h @@ -0,0 +1,43 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_CLEAR_H +#define CELL_CLEAR_H + + +struct pipe_context; +struct pipe_surface; + + +extern void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, +                   unsigned clearValue); + + + +#endif /* CELL_CLEAR_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c new file mode 100644 index 0000000000..ae82ded334 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -0,0 +1,183 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/** + * Authors + *  Brian Paul + */ + + +#include <stdio.h> + +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_screen.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +#include "cell/common.h" +#include "cell_batch.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_state.h" +#include "cell_surface.h" +#include "cell_spu.h" +#include "cell_pipe_state.h" +#include "cell_texture.h" +#include "cell_vbuf.h" + + + +static void +cell_destroy_context( struct pipe_context *pipe ) +{ +   struct cell_context *cell = cell_context(pipe); + +   util_delete_keymap(cell->fragment_ops_cache, NULL); + +   cell_spu_exit(cell); + +   align_free(cell); +} + + +static struct draw_context * +cell_draw_create(struct cell_context *cell) +{ +   struct draw_context *draw = draw_create(); + +#if 0 /* broken */ +   if (getenv("GALLIUM_CELL_VS")) { +      /* plug in SPU-based vertex transformation code */ +      draw->shader_queue_flush = cell_vertex_shader_queue_flush; +      draw->driver_private = cell; +   } +#endif + +   return draw; +} + + +static const struct debug_named_value cell_debug_flags[] = { +   {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ +   {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */ +   {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */ +   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ +   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ +   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */ +   {"cache", CELL_DEBUG_CACHE},   /**< report texture cache stats on exit */ +   {NULL, 0} +}; + + +struct pipe_context * +cell_create_context(struct pipe_screen *screen, +                    struct cell_winsys *cws) +{ +   struct cell_context *cell; +   uint i; + +   /* some fields need to be 16-byte aligned, so align the whole object */ +   cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); +   if (!cell) +      return NULL; + +   memset(cell, 0, sizeof(*cell)); + +   cell->winsys = cws; +   cell->pipe.winsys = screen->winsys; +   cell->pipe.screen = screen; +   cell->pipe.destroy = cell_destroy_context; + +   cell->pipe.clear = cell_clear_surface; +   cell->pipe.flush = cell_flush; + +#if 0 +   cell->pipe.begin_query = cell_begin_query; +   cell->pipe.end_query = cell_end_query; +   cell->pipe.wait_query = cell_wait_query; +#endif + +   cell_init_draw_functions(cell); +   cell_init_state_functions(cell); +   cell_init_shader_functions(cell); +   cell_init_surface_functions(cell); +   cell_init_vertex_functions(cell); + +   cell->draw = cell_draw_create(cell); + +   /* Create cache of fragment ops generated code */ +   cell->fragment_ops_cache = +      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL); + +   cell_init_vbuf(cell); + +   draw_set_rasterize_stage(cell->draw, cell->vbuf); + +   /* convert all points/lines to tris for the time being */ +   draw_wide_point_threshold(cell->draw, 0.0); +   draw_wide_line_threshold(cell->draw, 0.0); + +   /* get env vars or read config file to get debug flags */ +   cell->debug_flags = debug_get_flags_option("CELL_DEBUG",  +                                  
            cell_debug_flags,  +                                              0 ); + +   for (i = 0; i < CELL_NUM_BUFFERS; i++) +      cell_fence_init(&cell->fenced_buffers[i].fence); + + +   /* +    * SPU stuff +    */ +   /* This call only works with SDK 3.0.  Anyone still using 2.1??? */ +   cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); +   cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); +   if (cell->debug_flags) { +      printf("Cell: found %d Cell(s) with %u SPUs\n", +             cell->num_cells, cell->num_spus); +   } +   if (getenv("CELL_NUM_SPUS")) { +      cell->num_spus = atoi(getenv("CELL_NUM_SPUS")); +      assert(cell->num_spus > 0); +   } + +   cell_start_spus(cell); + +   cell_init_batch_buffers(cell); + +   /* make sure SPU initializations are done before proceeding */ +   cell_flush_int(cell, CELL_FLUSH_WAIT); + +   return &cell->pipe; +} diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h new file mode 100644 index 0000000000..eb1397bb3f --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -0,0 +1,205 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_CONTEXT_H +#define CELL_CONTEXT_H + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vbuf.h" +#include "cell_winsys.h" +#include "cell/common.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_keymap.h" + + +struct cell_vbuf_render; + + +/** + * Cell vertex shader state, subclass of pipe_shader_state. + */ +struct cell_vertex_shader_state +{ +   struct pipe_shader_state shader; +   struct tgsi_shader_info info; +   void *draw_data; +}; + + +/** + * Cell fragment shader state, subclass of pipe_shader_state. + */ +struct cell_fragment_shader_state +{ +   struct pipe_shader_state shader; +   struct tgsi_shader_info info; +   struct spe_function code; +   void *data; +}; + + +/** + * Key for mapping per-fragment state to cached SPU machine code. 
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops + */ +struct cell_fragment_ops_key +{ +   struct pipe_blend_state blend; +   struct pipe_blend_color blend_color; +   struct pipe_depth_stencil_alpha_state dsa; +   enum pipe_format color_format; +   enum pipe_format zs_format; +}; + + +struct cell_buffer_node; + +/** + * Fenced buffer list.  List of buffers which can be unreferenced after + * the fence has been executed/signalled. + */ +struct cell_buffer_list +{ +   struct cell_fence fence ALIGN16_ATTRIB; +   struct cell_buffer_node *head; +}; + + +/** + * Per-context state, subclass of pipe_context. + */ +struct cell_context +{ +   struct pipe_context pipe; + +   struct cell_winsys *winsys; + +   const struct pipe_blend_state *blend; +   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; +   uint num_samplers; +   const struct pipe_depth_stencil_alpha_state *depth_stencil; +   const struct pipe_rasterizer_state *rasterizer; +   const struct cell_vertex_shader_state *vs; +   const struct cell_fragment_shader_state *fs; + +   struct spe_function logic_op; + +   struct pipe_blend_color blend_color; +   struct pipe_clip_state clip; +   struct pipe_constant_buffer constants[2]; +   struct pipe_framebuffer_state framebuffer; +   struct pipe_poly_stipple poly_stipple; +   struct pipe_scissor_state scissor; +   struct cell_texture *texture[PIPE_MAX_SAMPLERS]; +   uint num_textures; +   struct pipe_viewport_state viewport; +   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; +   uint num_vertex_buffers; +   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; +   uint num_vertex_elements; + +   ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS]; +   ubyte *zsbuf_map; + +   struct pipe_surface *tex_surf; +   uint *tex_map; + +   uint dirty; +   uint dirty_textures;  /* bitmask of texture units */ +   uint dirty_samplers;  /* bitmask of sampler units */ + +   /** Cache of code generated for per-fragment ops */ +   struct keymap *fragment_ops_cache; + +   /** The primitive drawing context */ +   struct draw_context *draw; +   struct draw_stage *render_stage; + +   /** For post-transformed vertex buffering: */ +   struct cell_vbuf_render *vbuf_render; +   struct draw_stage *vbuf; + +   struct vertex_info vertex_info; + +   /** Mapped constant buffers */ +   void *mapped_constants[PIPE_SHADER_TYPES]; + +   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; + +   uint num_cells, num_spus; + +   /** Buffers for command batches, vertex/index data */ +   uint buffer_size[CELL_NUM_BUFFERS]; +   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + +   int cur_batch;  /**< which buffer is being filled w/ commands */ + +   /** [4] to ensure 16-byte alignment for each status word */ +   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + + +   /** Associated with each command/batch buffer is a list of pipe_buffers +    * that are fenced.  When the last command in a buffer is executed, the +    * fence will be signalled, indicating that any pipe_buffers preceeding +    * that fence can be unreferenced (and probably freed). 
+    */ +   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS]; + + +   struct spe_function attrib_fetch; +   unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; + +   unsigned debug_flags; +}; + + + + +static INLINE struct cell_context * +cell_context(struct pipe_context *pipe) +{ +   return (struct cell_context *) pipe; +} + + +extern struct pipe_context * +cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws); + +extern void +cell_vertex_shader_queue_flush(struct draw_context *draw); + + +/* XXX find a better home for this */ +extern void cell_update_vertex_fetch(struct draw_context *draw); + + +#endif /* CELL_CONTEXT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c new file mode 100644 index 0000000000..644496db40 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -0,0 +1,191 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Author: + *    Brian Paul + *    Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_state.h" +#include "cell_flush.h" + +#include "draw/draw_context.h" + + + +static void +cell_map_constant_buffers(struct cell_context *sp) +{ +   struct pipe_winsys *ws = sp->pipe.winsys; +   uint i; +   for (i = 0; i < 2; i++) { +      if (sp->constants[i].buffer && sp->constants[i].buffer->size) { +         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, +                                                   PIPE_BUFFER_USAGE_CPU_READ); +         cell_flush_buffer_range(sp, sp->mapped_constants[i],  +                                 sp->constants[i].buffer->size); +      } +   } + +   draw_set_mapped_constant_buffer(sp->draw, +                                   sp->mapped_constants[PIPE_SHADER_VERTEX], +                                   sp->constants[PIPE_SHADER_VERTEX].buffer->size); +} + +static void +cell_unmap_constant_buffers(struct cell_context *sp) +{ +   struct pipe_winsys *ws = sp->pipe.winsys; +   uint i; +   for (i = 0; i < 2; i++) { +      if (sp->constants[i].buffer && sp->constants[i].buffer->size) +         ws->buffer_unmap(ws, sp->constants[i].buffer); +      sp->mapped_constants[i] = NULL; +   } +} + + + +/** + * Draw vertex arrays, with optional indexing. + * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. + * + * XXX should the element buffer be specified/bound with a separate function? + */ +static boolean +cell_draw_range_elements(struct pipe_context *pipe, +                         struct pipe_buffer *indexBuffer, +                         unsigned indexSize, +                         unsigned min_index, +                         unsigned max_index, +                         unsigned mode, unsigned start, unsigned count) +{ +   struct cell_context *sp = cell_context(pipe); +   struct draw_context *draw = sp->draw; +   unsigned i; + +   if (sp->dirty) +      cell_update_derived( sp ); + +#if 0 +   cell_map_surfaces(sp); +#endif +   cell_map_constant_buffers(sp); + +   /* +    * Map vertex buffers +    */ +   for (i = 0; i < sp->num_vertex_buffers; i++) { +      void *buf = pipe_buffer_map(pipe->screen, +                                           sp->vertex_buffer[i].buffer, +                                           PIPE_BUFFER_USAGE_CPU_READ); +      cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size); +      draw_set_mapped_vertex_buffer(draw, i, buf); +   } +   /* Map index buffer, if present */ +   if (indexBuffer) { +      void *mapped_indexes = pipe_buffer_map(pipe->screen, +                                                      indexBuffer, +                                                      PIPE_BUFFER_USAGE_CPU_READ); +      draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); +   } +   else { +      /* no index/element buffer */ +      draw_set_mapped_element_buffer(draw, 0, NULL); +   } + + +   /* draw! 
*/ +   draw_arrays(draw, mode, start, count); + +   /* +    * unmap vertex/index buffers - will cause draw module to flush +    */ +   for (i = 0; i < sp->num_vertex_buffers; i++) { +      draw_set_mapped_vertex_buffer(draw, i, NULL); +      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer); +   } +   if (indexBuffer) { +      draw_set_mapped_element_buffer(draw, 0, NULL); +      pipe_buffer_unmap(pipe->screen, indexBuffer); +   } + +   /* Note: leave drawing surfaces mapped */ +   cell_unmap_constant_buffers(sp); + +   return TRUE; +} + + +static boolean +cell_draw_elements(struct pipe_context *pipe, +                   struct pipe_buffer *indexBuffer, +                   unsigned indexSize, +                   unsigned mode, unsigned start, unsigned count) +{ +   return cell_draw_range_elements( pipe, indexBuffer, +                                    indexSize, +                                    0, 0xffffffff, +                                    mode, start, count ); +} + + +static boolean +cell_draw_arrays(struct pipe_context *pipe, unsigned mode, +                     unsigned start, unsigned count) +{ +   return cell_draw_elements(pipe, NULL, 0, mode, start, count); +} + + +static void +cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags) +{ +   struct cell_context *cell = cell_context(pipe); +   draw_set_edgeflags(cell->draw, edgeflags); +} + + + +void +cell_init_draw_functions(struct cell_context *cell) +{ +   cell->pipe.draw_arrays = cell_draw_arrays; +   cell->pipe.draw_elements = cell_draw_elements; +   cell->pipe.draw_range_elements = cell_draw_range_elements; +   cell->pipe.set_edgeflags = cell_set_edgeflags; +} + diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h new file mode 100644 index 0000000000..148873aa67 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h @@ -0,0 +1,36 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#ifndef CELL_DRAW_ARRAYS_H +#define CELL_DRAW_ARRAYS_H + + +extern void +cell_init_draw_functions(struct cell_context *cell); + + +#endif /* CELL_DRAW_ARRAYS_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c new file mode 100644 index 0000000000..867b5dcaa0 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -0,0 +1,168 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#include <unistd.h> +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_texture.h" + + +void +cell_fence_init(struct cell_fence *fence) +{ +   uint i; +   ASSERT_ALIGN16(fence->status); +   for (i = 0; i < CELL_MAX_SPUS; i++) { +      fence->status[i][0] = CELL_FENCE_IDLE; +   } +} + + +boolean +cell_fence_signalled(const struct cell_context *cell, +                     const struct cell_fence *fence) +{ +   uint i; +   for (i = 0; i < cell->num_spus; i++) { +      if (fence->status[i][0] != CELL_FENCE_SIGNALLED) +         return FALSE; +      /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ +   } +   return TRUE; +} + + +void +cell_fence_finish(const struct cell_context *cell, +                  const struct cell_fence *fence) +{ +   while (!cell_fence_signalled(cell, fence)) { +      usleep(10); +   } + +#ifdef DEBUG +   { +      uint i; +      for (i = 0; i < cell->num_spus; i++) { +         assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); +      } +   } +#endif +} + + + + +struct cell_buffer_node +{ +   struct pipe_buffer *buffer; +   struct cell_buffer_node *next; +}; + + +static void +cell_add_buffer_to_list(struct cell_context *cell, +                        struct cell_buffer_list *list, +                        struct pipe_buffer *buffer) +{ +   struct pipe_screen *ps = cell->pipe.screen; +   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node); +   /* create new list node which references the buffer, insert at head */ +   if (node) { +      pipe_buffer_reference(ps, &node->buffer, buffer); +      node->next = list->head; +      list->head = node; +   } +} + + +/** + * Wait for completion of the given fence, then unreference any buffers + * on the list. + * This typically unrefs/frees texture buffers after any rendering which uses + * them has completed. + */ +void +cell_free_fenced_buffers(struct cell_context *cell, +                         struct cell_buffer_list *list) +{ +   if (list->head) { +      struct pipe_screen *ps = cell->pipe.screen; +      struct cell_buffer_node *node; + +      cell_fence_finish(cell, &list->fence); + +      /* traverse the list, unreferencing buffers, freeing nodes */ +      node = list->head; +      while (node) { +         struct cell_buffer_node *next = node->next; +         assert(node->buffer); +         pipe_buffer_unmap(ps, node->buffer); +#if 0 +         printf("Unref buffer %p\n", node->buffer); +         if (node->buffer->refcount == 1) +            printf("   Delete!\n"); +#endif +         pipe_buffer_reference(ps, &node->buffer, NULL); +         FREE(node); +         node = next; +      } +      list->head = NULL; +   } +} + + +/** + * This should be called for each render command. + * Any texture buffers that are current bound will be added to a fenced + * list to be freed later when the fence is executed/signalled. 
+ */ +void +cell_add_fenced_textures(struct cell_context *cell) +{ +   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch]; +   uint i; + +   for (i = 0; i < cell->num_textures; i++) { +      struct cell_texture *ct = cell->texture[i]; +      if (ct) { +         uint level; +         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { +            if (ct->tiled_buffer[level]) { +#if 0 +               printf("Adding texture %p buffer %p to list\n", +                      ct, ct->tiled_buffer[level]); +#endif +               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]); +            } +         } +      } +   } +} diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h new file mode 100644 index 0000000000..536b4ba411 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.h @@ -0,0 +1,57 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_FENCE_H +#define CELL_FENCE_H + + +extern void +cell_fence_init(struct cell_fence *fence); + + +extern boolean +cell_fence_signalled(const struct cell_context *cell, +                     const struct cell_fence *fence); + + +extern void +cell_fence_finish(const struct cell_context *cell, +                  const struct cell_fence *fence); + + + +extern void +cell_free_fenced_buffers(struct cell_context *cell, +                         struct cell_buffer_list *list); + + +extern void +cell_add_fenced_textures(struct cell_context *cell); + + +#endif /* CELL_FENCE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c new file mode 100644 index 0000000000..8275c9dc9c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -0,0 +1,112 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_render.h" +#include "draw/draw_context.h" + + +/** + * Called via pipe->flush() + */ +void +cell_flush(struct pipe_context *pipe, unsigned flags, +           struct pipe_fence_handle **fence) +{ +   struct cell_context *cell = cell_context(pipe); + +   if (fence) { +      *fence = NULL; +      /* XXX: Implement real fencing */ +      flags |= CELL_FLUSH_WAIT; +   } + +   if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) +      flags |= CELL_FLUSH_WAIT; + +   draw_flush( cell->draw ); +   cell_flush_int(cell, flags); +} + + +/** + * Cell internal flush function.  Send the current batch buffer to all SPUs. + * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle. 
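
cell_flush_int() below realizes this by appending a CELL_CMD_FINISH opcode to the batch and then, when CELL_FLUSH_WAIT is set, waiting for each SPU to mail the opcode back. A self-contained model of that control flow, with the SPU mailboxes replaced by a plain array and the names made up for illustration:

#include <assert.h>
#include <stdio.h>

/* Model of the flush handshake; the per-SPU mailboxes are a plain array
 * here, while the driver talks to real SPE contexts.
 */
#define CMD_FINISH  3
#define FLUSH_WAIT  0x80000000u
#define NUM_SPUS    6

static unsigned mailbox[NUM_SPUS];

static void spus_run_batch(void)
{
   /* pretend each SPU processed the batch and acknowledged the FINISH */
   unsigned i;
   for (i = 0; i < NUM_SPUS; i++)
      mailbox[i] = CMD_FINISH;
}

static void flush(unsigned flags)
{
   if (flags & FLUSH_WAIT) {
      /* a CELL_CMD_FINISH-style opcode would be appended to the batch here */
   }

   spus_run_batch();                  /* hand the batch to the SPUs */

   if (flags & FLUSH_WAIT) {
      unsigned i;
      for (i = 0; i < NUM_SPUS; i++)
         assert(mailbox[i] == CMD_FINISH);   /* block until every SPU acks */
   }
}

int main(void)
{
   flush(FLUSH_WAIT);
   printf("all SPUs idle\n");
   return 0;
}
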
+ * \param flags  bitmask of flags CELL_FLUSH_WAIT, or zero + */ +void +cell_flush_int(struct cell_context *cell, unsigned flags) +{ +   static boolean flushing = FALSE;  /* recursion catcher */ +   uint i; + +   ASSERT(!flushing); +   flushing = TRUE; + +   if (flags & CELL_FLUSH_WAIT) { +      STATIC_ASSERT(sizeof(opcode_t) % 16 == 0); +      opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t)); +      *cmd[0] = CELL_CMD_FINISH; +   } + +   cell_batch_flush(cell); + +#if 0 +   /* Send CMD_FINISH to all SPUs */ +   for (i = 0; i < cell->num_spus; i++) { +      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_FINISH); +   } +#endif + +   if (flags & CELL_FLUSH_WAIT) { +      /* Wait for ack */ +      for (i = 0; i < cell->num_spus; i++) { +         uint k = wait_mbox_message(cell_global.spe_contexts[i]); +         assert(k == CELL_CMD_FINISH); +      } +   } + +   flushing = FALSE; +} + + +void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, +			unsigned size) +{ +   STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0); +   uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell,  +      sizeof(opcode_t) + sizeof(struct cell_buffer_range)); +   struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4]; +   batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; +   br->base = (uintptr_t) ptr; +   br->size = size; +} diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h new file mode 100644 index 0000000000..509ae6239a --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.h @@ -0,0 +1,45 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#ifndef CELL_FLUSH +#define CELL_FLUSH + +#define CELL_FLUSH_WAIT 0x80000000 + +extern void +cell_flush(struct pipe_context *pipe, unsigned flags, +           struct pipe_fence_handle **fence); + +extern void +cell_flush_int(struct cell_context *cell, unsigned flags); + +extern void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, +			unsigned size); + +#endif diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 0000000000..5a889a6119 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,2046 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2009 VMware, Inc.  All rights reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here.  So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). + * + * \author Brian Paul + */ + +#include <math.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +#define MAX_TEMPS 16 +#define MAX_IMMED  8 + +#define CHAN_X  0 +#define CHAN_Y  1 +#define CHAN_Z  2 +#define CHAN_W  3 + +/** + * Context needed during code generation. 
+ */ +struct codegen +{ +   struct cell_context *cell; +   int inputs_reg;      /**< 1st function parameter */ +   int outputs_reg;     /**< 2nd function parameter */ +   int constants_reg;   /**< 3rd function parameter */ +   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ +   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */ + +   int num_imm;  /**< number of immediates */ + +   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */ + +   int addr_reg;        /**< address register, integer values */ + +   /** Per-instruction temps / intermediate temps */ +   int num_itemps; +   int itemps[12]; + +   /** Current IF/ELSE/ENDIF nesting level */ +   int if_nesting; +   /** Current BGNLOOP/ENDLOOP nesting level */ +   int loop_nesting; +   /** Location of start of current loop */ +   int loop_start; + +   /** Index of if/conditional mask register */ +   int cond_mask_reg; +   /** Index of loop mask register */ +   int loop_mask_reg; + +   /** Index of master execution mask register */ +   int exec_mask_reg; + +   /** KIL mask: indicates which fragments have been killed */ +   int kill_mask_reg; + +   int frame_size;  /**< Stack frame size, in words */ + +   struct spe_function *f; +   boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ +   int t = spe_allocate_available_register(gen->f); +   assert(gen->num_itemps < Elements(gen->itemps)); +   gen->itemps[gen->num_itemps++] = t; +   return t; +} + +/** + * Free all intermediate temporary registers.  To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ +   int i; +   for (i = 0; i < gen->num_itemps; i++) { +      spe_release_register(gen->f, gen->itemps[i]); +   } +   gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. + */ +static int +get_const_one_reg(struct codegen *gen) +{ +   if (gen->one_reg <= 0) { +      gen->one_reg = spe_allocate_available_register(gen->f); + +      spe_indent(gen->f, 4); +      spe_comment(gen->f, -4, "init constant reg = 1.0:"); + +      /* one = {1.0, 1.0, 1.0, 1.0} */ +      spe_load_float(gen->f, gen->one_reg, 1.0f); + +      spe_indent(gen->f, -4); +   } + +   return gen->one_reg; +} + + +/** + * Return index of the address register. + * Used for indirect register loads/stores. + */ +static int +get_address_reg(struct codegen *gen) +{ +   if (gen->addr_reg <= 0) { +      gen->addr_reg = spe_allocate_available_register(gen->f); + +      spe_indent(gen->f, 4); +      spe_comment(gen->f, -4, "init address reg = 0:"); + +      /* init addr = {0, 0, 0, 0} */ +      spe_zero(gen->f, gen->addr_reg); + +      spe_indent(gen->f, -4); +   } + +   return gen->addr_reg; +} + + +/** + * Return index of the master execution mask. + * The register is allocated an initialized upon the first call. + * + * The master execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. 
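
Because the generated code is SOA, "not modifying a pixel" cannot be done with a branch; instead every result is merged into the destination with a bitwise select keyed on the execution mask (this is what store_dest_reg() does later with spe_selb). A scalar C model of that masked update, with illustrative names:

#include <stdint.h>
#include <stdio.h>

/* One quad = four pixels handled SOA-style; a mask lane is all-ones when
 * the pixel is active and all-zeros when it is not, exactly the values the
 * SPE compare instructions produce.
 */
static void masked_store(uint32_t dst[4], const uint32_t src[4],
                         const uint32_t exec_mask[4])
{
   int i;
   for (i = 0; i < 4; i++) {
      /* bitwise select: keep dst where the mask is 0, take src where it is ~0 */
      dst[i] = (dst[i] & ~exec_mask[i]) | (src[i] & exec_mask[i]);
   }
}

int main(void)
{
   uint32_t dst[4]  = { 1, 2, 3, 4 };
   uint32_t src[4]  = { 9, 9, 9, 9 };
   uint32_t mask[4] = { 0xffffffffu, 0, 0xffffffffu, 0 };  /* pixels 0 and 2 active */

   masked_store(dst, src, mask);
   printf("%u %u %u %u\n", dst[0], dst[1], dst[2], dst[3]);  /* 9 2 9 4 */
   return 0;
}
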
+ */ +static int +get_exec_mask_reg(struct codegen *gen) +{ +   if (gen->exec_mask_reg <= 0) { +      gen->exec_mask_reg = spe_allocate_available_register(gen->f); + +      /* XXX this may not be needed */ +      spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0"); +      spe_load_int(gen->f, gen->exec_mask_reg, ~0); +   } + +   return gen->exec_mask_reg; +} + + +/** Return index of the conditional (if/else) execution mask register */ +static int +get_cond_mask_reg(struct codegen *gen) +{ +   if (gen->cond_mask_reg <= 0) { +      gen->cond_mask_reg = spe_allocate_available_register(gen->f); +   } + +   return gen->cond_mask_reg; +} + + +/** Return index of the loop execution mask register */ +static int +get_loop_mask_reg(struct codegen *gen) +{ +   if (gen->loop_mask_reg <= 0) { +      gen->loop_mask_reg = spe_allocate_available_register(gen->f); +   } + +   return gen->loop_mask_reg; +} + + + +static boolean +is_register_src(struct codegen *gen, int channel, +                const struct tgsi_full_src_register *src) +{ +   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); +   int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + +   if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { +      return FALSE; +   } +   if (src->SrcRegister.File == TGSI_FILE_TEMPORARY || +       src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { +      return TRUE; +   } +   return FALSE; +} + +   +static boolean +is_memory_dst(struct codegen *gen, int channel, +              const struct tgsi_full_dst_register *dst) +{ +   if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { +      return TRUE; +   } +   else { +      return FALSE; +   } +} + +   +/** + * Return the index of the SPU temporary containing the named TGSI + * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register.  If the TGIS register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. 
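
For the INPUT and CONSTANT cases below, the load address is computed in quadwords: component chan of register index lives at (index*4 + chan) quadwords, 16 bytes each, past the inputs/constants base pointer. A tiny C restatement of that offset calculation (the helper name is made up for illustration):

#include <stdio.h>

/* Each TGSI register is stored SOA as four 16-byte quadwords (x, y, z, w),
 * so component 'chan' of register 'index' sits (index*4 + chan) quadwords
 * from the base -- the offset that is later multiplied by 16 for spe_lqd().
 */
static unsigned component_byte_offset(unsigned index, unsigned chan)
{
   return (index * 4 + chan) * 16;
}

int main(void)
{
   /* INPUT[2].z -> (2*4 + 2) quadwords = 160 bytes past the inputs pointer */
   printf("%u bytes\n", component_byte_offset(2, 2));
   return 0;
}
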
+ */ +static int +get_src_reg(struct codegen *gen, +            int channel, +            const struct tgsi_full_src_register *src) +{ +   int reg = -1; +   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); +   boolean reg_is_itemp = FALSE; +   uint sign_op; + +   assert(swizzle >= TGSI_SWIZZLE_X); +   assert(swizzle <= TGSI_EXTSWIZZLE_ONE); + +   if (swizzle == TGSI_EXTSWIZZLE_ONE) { +      /* Load const one float and early out */ +      reg = get_const_one_reg(gen); +   } +   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) { +      /* Load const zero float and early out */ +      reg = get_itemp(gen); +      spe_xor(gen->f, reg, reg, reg); +   } +   else { +      int index = src->SrcRegister.Index; + +      assert(swizzle < 4); + +      if (src->SrcRegister.Indirect) { +         /* XXX unfinished */ +      } + +      switch (src->SrcRegister.File) { +      case TGSI_FILE_TEMPORARY: +         reg = gen->temp_regs[index][swizzle]; +         break; +      case TGSI_FILE_INPUT: +         { +            /* offset is measured in quadwords, not bytes */ +            int offset = index * 4 + swizzle; +            reg = get_itemp(gen); +            reg_is_itemp = TRUE; +            /* Load:  reg = memory[(machine_reg) + offset] */ +            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); +         } +         break; +      case TGSI_FILE_IMMEDIATE: +         reg = gen->imm_regs[index][swizzle]; +         break; +      case TGSI_FILE_CONSTANT: +         { +            /* offset is measured in quadwords, not bytes */ +            int offset = index * 4 + swizzle; +            reg = get_itemp(gen); +            reg_is_itemp = TRUE; +            /* Load:  reg = memory[(machine_reg) + offset] */ +            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); +         } +         break; +      default: +         assert(0); +      } +   } + +   /* +    * Handle absolute value, negate or set-negative of src register. +    */ +   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); +   if (sign_op != TGSI_UTIL_SIGN_KEEP) { +      /* +       * All sign ops are done by manipulating bit 31, the IEEE float sign bit. +       */ +      const int bit31mask_reg = get_itemp(gen); +      int result_reg; + +      if (reg_is_itemp) { +         /* re-use 'reg' for the result */ +         result_reg = reg; +      } +      else { +         /* alloc a new reg for the result */ +         result_reg = get_itemp(gen); +      } + +      /* mask with bit 31 set, the rest cleared */ +      spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + +      if (sign_op == TGSI_UTIL_SIGN_CLEAR) { +         spe_andc(gen->f, result_reg, reg, bit31mask_reg); +      } +      else if (sign_op == TGSI_UTIL_SIGN_SET) { +         spe_and(gen->f, result_reg, reg, bit31mask_reg); +      } +      else { +         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); +         spe_xor(gen->f, result_reg, reg, bit31mask_reg); +      } + +      reg = result_reg; +   } + +   return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the + * corresponding SPE register is returned.  If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. + * See store_dest_reg() below... 
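
The sign-mode handling in get_src_reg() above works entirely on bit 31, the IEEE-754 sign bit: clearing it gives |x| and toggling it negates, while forcing it on (an OR with the same mask) is the conventional way to obtain -|x|. The same three operations in portable C, using memcpy for the type pun:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* The three TGSI sign modes as bit operations on the IEEE sign bit.
 * mode: 0 = SIGN_CLEAR (|x|), 1 = SIGN_SET (-|x|), 2 = SIGN_TOGGLE (-x).
 */
static float apply_sign_mode(float x, int mode)
{
   const uint32_t sign = 1u << 31;
   uint32_t bits;

   memcpy(&bits, &x, sizeof bits);       /* type-pun via memcpy, not a cast */
   if (mode == 0)      bits &= ~sign;    /* clear the sign bit  ->  |x| */
   else if (mode == 1) bits |=  sign;    /* force the sign bit  -> -|x| */
   else                bits ^=  sign;    /* flip the sign bit   ->  -x  */
   memcpy(&x, &bits, sizeof x);
   return x;
}

int main(void)
{
   printf("%g %g %g\n",
          apply_sign_mode(-2.5f, 0),    /*  2.5 */
          apply_sign_mode( 2.5f, 1),    /* -2.5 */
          apply_sign_mode( 2.5f, 2));   /* -2.5 */
   return 0;
}
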
+ */ +static int +get_dst_reg(struct codegen *gen, +            int channel, +            const struct tgsi_full_dst_register *dest) +{ +   int reg = -1; + +   switch (dest->DstRegister.File) { +   case TGSI_FILE_TEMPORARY: +      if (gen->if_nesting > 0 || gen->loop_nesting > 0) +         reg = get_itemp(gen); +      else +         reg = gen->temp_regs[dest->DstRegister.Index][channel]; +      break; +   case TGSI_FILE_OUTPUT: +      reg = get_itemp(gen); +      break; +   default: +      assert(0); +   } + +   return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg  the SPE register containing the value to store. + *                   This would have been returned by get_dst_reg(). + */ +static void +store_dest_reg(struct codegen *gen, +               int value_reg, int channel, +               const struct tgsi_full_dst_register *dest) +{ +   /* +    * XXX need to implement dst reg clamping/saturation +    */ +#if 0 +   switch (inst->Instruction.Saturate) { +   case TGSI_SAT_NONE: +      break; +   case TGSI_SAT_ZERO_ONE: +      break; +   case TGSI_SAT_MINUS_PLUS_ONE: +      break; +   default: +      assert( 0 ); +   } +#endif + +   switch (dest->DstRegister.File) { +   case TGSI_FILE_TEMPORARY: +      if (gen->if_nesting > 0 || gen->loop_nesting > 0) { +         int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; +         int exec_reg = get_exec_mask_reg(gen); +         /* Mix d with new value according to exec mask: +          * d[i] = mask_reg[i] ? value_reg : d_reg +          */ +         spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); +      } +      else { +         /* we're not inside a condition or loop: do nothing special */ + +      } +      break; +   case TGSI_FILE_OUTPUT: +      { +         /* offset is measured in quadwords, not bytes */ +         int offset = dest->DstRegister.Index * 4 + channel; +         if (gen->if_nesting > 0 || gen->loop_nesting > 0) { +            int exec_reg = get_exec_mask_reg(gen); +            int curval_reg = get_itemp(gen); +            /* First read the current value from memory: +             * Load:  curval = memory[(machine_reg) + offset] +             */ +            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); +            /* Mix curval with newvalue according to exec mask: +             * d[i] = mask_reg[i] ? 
value_reg : d_reg +             */ +            spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); +            /* Store: memory[(machine_reg) + offset] = curval */ +            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); +         } +         else { +            /* Store: memory[(machine_reg) + offset] = reg */ +            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); +         } +      } +      break; +   default: +      assert(0); +   } +} + + + +static void +emit_prologue(struct codegen *gen) +{ +   gen->frame_size = 1024; /* XXX temporary, should be dynamic */ + +   spe_comment(gen->f, 0, "Function prologue:"); + +   /* save $lr on stack     # stqd $lr,16($sp) */ +   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + +   if (gen->frame_size >= 512) { +      /* offset is too large for ai instruction */ +      int offset_reg = spe_allocate_available_register(gen->f); +      int sp_reg = spe_allocate_available_register(gen->f); +      /* offset = -framesize */ +      spe_load_int(gen->f, offset_reg, -gen->frame_size); +      /* sp = $sp */ +      spe_move(gen->f, sp_reg, SPE_REG_SP); +      /* $sp = $sp + offset_reg */ +      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); +      /* save $sp in stack frame */ +      spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); +      /* clean up */ +      spe_release_register(gen->f, offset_reg); +      spe_release_register(gen->f, sp_reg); +   } +   else { +      /* save stack pointer    # stqd $sp,-frameSize($sp) */ +      spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + +      /* adjust stack pointer  # ai $sp,$sp,-frameSize */ +      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); +   } +} + + +static void +emit_epilogue(struct codegen *gen) +{ +   const int return_reg = 3; + +   spe_comment(gen->f, 0, "Function epilogue:"); + +   spe_comment(gen->f, 0, "return the killed mask"); +   if (gen->kill_mask_reg > 0) { +      /* shader called KIL, return the "alive" mask */ +      spe_move(gen->f, return_reg, gen->kill_mask_reg); +   } +   else { +      /* return {0,0,0,0} */ +      spe_load_uint(gen->f, return_reg, 0); +   } + +   spe_comment(gen->f, 0, "restore stack and return"); +   if (gen->frame_size >= 512) { +      /* offset is too large for ai instruction */ +      int offset_reg = spe_allocate_available_register(gen->f); +      /* offset = framesize */ +      spe_load_int(gen->f, offset_reg, gen->frame_size); +      /* $sp = $sp + offset */ +      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); +      /* clean up */ +      spe_release_register(gen->f, offset_reg); +   } +   else { +      /* restore stack pointer    # ai $sp,$sp,frameSize */ +      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); +   } + +   /* restore $lr              # lqd $lr,16($sp) */ +   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + +   /* return from function call */ +   spe_bi(gen->f, SPE_REG_RA, 0, 0); +} + + +#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \ +   for (ch = 0; ch < 4; ch++) \ +      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) + + +static boolean +emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch = 0, src_reg, addr_reg; + +   src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +   addr_reg = get_address_reg(gen); + +   /* convert float to int */ +   spe_cflts(gen->f, addr_reg, src_reg, 0); + +   free_itemps(gen); + +   return TRUE; +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction 
*inst) +{ +   int ch, src_reg[4], dst_reg[4]; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && +          is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { +         /* special-case: register to memory store */ +         store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); +      } +      else { +         spe_move(gen->f, dst_reg[ch], src_reg[ch]); +         store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); +      } +   } + +   free_itemps(gen); + +   return TRUE; +} + +/** + * Emit binary operation + */ +static boolean +emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], s2_reg[4], d_reg[4]; + +   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      /* Emit actual SPE instruction: d = s1 + s2 */ +      switch (inst->Instruction.Opcode) { +      case TGSI_OPCODE_ADD: +         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         break; +      case TGSI_OPCODE_SUB: +         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         break; +      case TGSI_OPCODE_MUL: +         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         break; +      default: +         ; +      } +   } + +   /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   /* Free any intermediate temps we allocated */ +   free_itemps(gen); + +   return TRUE; +} + + +/** + * Emit multiply add.  See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit linear interpolate.  See emit_ADD for comments. 
+ */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; + +   /* setup/get src/dst/temp regs */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      tmp_reg[ch] = get_itemp(gen); +   } + +   /* d = s3 + s1(s2 - s3) */ +   /* do all subtracts, then all fma, then all stores to better pipeline */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } +   free_itemps(gen); +   return TRUE; +} + + + +/** + * Emit reciprocal or recip sqrt. + */ +static boolean +emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], d_reg[4], tmp_reg[4]; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      tmp_reg[ch] = get_itemp(gen); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { +         /* tmp = 1/s1 */ +         spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]); +      } +      else { +         /* tmp = 1/sqrt(s1) */ +         spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]); +      } +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      /* d = float_interp(s1, tmp) */ +      spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit absolute value.  See emit_ADD for comments. + */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], d_reg[4]; +   const int bit31mask_reg = get_itemp(gen); + +   /* mask with bit 31 set, the rest cleared */   +   spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   /* d = sign bit cleared in s1 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + +/** + * Emit 3 component dot product.  See emit_ADD for comments. 
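
emit_LERP() above evaluates the interpolation as d = s3 + s1*(s2 - s3): one subtract and one fused multiply-add per channel instead of the textbook s1*s2 + (1 - s1)*s3, which is why the subtracts, fma's and stores can each be grouped for better pipelining. The identity in scalar C:

#include <stdio.h>

/* LERP written the way emit_LERP() schedules it: one subtract feeding one
 * multiply-add.  Here t, a, b stand for s1, s2, s3.
 */
static float lerp_fma(float t, float a, float b)
{
   return b + t * (a - b);        /* equal to t*a + (1 - t)*b */
}

int main(void)
{
   printf("%g\n", lerp_fma(0.25f, 10.0f, 20.0f));   /* 17.5 */
   return 0;
}
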
+ */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   int s1x_reg, s1y_reg, s1z_reg; +   int s2x_reg, s2y_reg, s2z_reg; +   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + +   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); +   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); +   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); +   s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + +   /* t0 = x0 * x1 */ +   spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); + +   /* t1 = y0 * y1 */ +   spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); + +   /* t0 = z0 * z1 + t0 */ +   spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); + +   /* t0 = t0 + t1 */ +   spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      spe_move(gen->f, d_reg, t0_reg); +      store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + +/** + * Emit 4 component dot product.  See emit_ADD for comments. + */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   int s0x_reg, s0y_reg, s0z_reg, s0w_reg; +   int s1x_reg, s1y_reg, s1z_reg, s1w_reg; +   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + +   s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); +   s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); +   s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); +   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); +   s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); +   s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + +   /* t0 = x0 * x1 */ +   spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg); + +   /* t1 = y0 * y1 */ +   spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg); + +   /* t0 = z0 * z1 + t0 */ +   spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg); + +   /* t1 = w0 * w1 + t1 */ +   spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg); + +   /* t0 = t0 + t1 */ +   spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      spe_move(gen->f, d_reg, t0_reg); +      store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + +/** + * Emit homogeneous dot product.  See emit_ADD for comments. 
+ */ +static boolean +emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   /* XXX rewrite this function to look more like DP3/DP4 */ +   int ch; +   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); +   int tmp_reg = get_itemp(gen); + +   /* t = x0 * x1 */ +   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + +   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); +   /* t = y0 * y1 + t */ +   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + +   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); +   /* t = z0 * z1 + t */ +   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + +   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); +   /* t = w1 + t */ +   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      spe_move(gen->f, d_reg, tmp_reg); +      store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + +/** + * Emit 3-component vector normalize. + */ +static boolean +emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   int src_reg[3]; +   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + +   src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + +   /* t0 = x * x */ +   spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]); + +   /* t1 = y * y */ +   spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]); + +   /* t0 = z * z + t0 */ +   spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg); + +   /* t0 = t0 + t1 */ +   spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + +   /* t1 = 1.0 / sqrt(t0) */ +   spe_frsqest(gen->f, t1_reg, t0_reg); +   spe_fi(gen->f, t1_reg, t0_reg, t1_reg); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      /* dst = src[ch] * t1 */ +      spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); +      store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit cross product.  See emit_ADD for comments. 
+ */ +static boolean +emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); +   int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); +   int tmp_reg = get_itemp(gen); + +   /* t = z0 * y1 */ +   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + +   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); +   /* t = y0 * z1 - t */ +   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + +   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) { +      store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]); +   } + +   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); +   /* t = x0 * z1 */ +   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + +   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); +   /* t = z0 * x1 - t */ +   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + +   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) { +      store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]); +   } + +   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); +   /* t = y0 * x1 */ +   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + +   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); +   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); +   /* t = x0 * y1 - t */ +   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + +   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) { +      store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit inequality instruction. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. 
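
That AND works because 0xffffffff preserves every bit of 1.0f while 0x00000000 yields +0.0f, so a compare mask becomes a 0.0/1.0 float directly; emit_inequality() below also uses an and-with-complement for the SGE/SLE cases. In plain C (the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Turn a per-lane compare result (0x00000000 / 0xffffffff) into 0.0f / 1.0f
 * by ANDing it with the bit pattern of 1.0f.
 */
static float mask_to_float(uint32_t mask)
{
   const float one = 1.0f;
   uint32_t one_bits, out_bits;
   float out;

   memcpy(&one_bits, &one, sizeof one_bits);   /* 0x3f800000 */
   out_bits = mask & one_bits;
   memcpy(&out, &out_bits, sizeof out);
   return out;
}

int main(void)
{
   uint32_t gt = (2.0f > 1.0f) ? 0xffffffffu : 0u;   /* fcgt-style result */
   printf("%g %g\n", mask_to_float(gt), mask_to_float(~gt));   /* 1 0 */
   return 0;
}
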
+ */ +static boolean +emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg; +   bool complement = FALSE; + +   one_reg = get_const_one_reg(gen); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      switch (inst->Instruction.Opcode) { +      case TGSI_OPCODE_SGT: +         spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         break; +      case TGSI_OPCODE_SLT: +         spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); +         break; +      case TGSI_OPCODE_SGE: +         spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); +         complement = TRUE; +         break; +      case TGSI_OPCODE_SLE: +         spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         complement = TRUE; +         break; +      case TGSI_OPCODE_SEQ: +         spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         break; +      case TGSI_OPCODE_SNE: +         spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); +         complement = TRUE; +         break; +      default: +         assert(0); +      } +   } + +   /* convert d from 0x0/0xffffffff to 0.0/1.0 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      /* d = d & one_reg */ +      if (complement) +         spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]); +      else +         spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit compare. + */ +static boolean +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      int zero_reg = get_itemp(gen); +    +      spe_zero(gen->f, zero_reg); + +      /* d = (s1 < 0) ? s2 : s3 */ +      spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); +      spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + +      store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +      free_itemps(gen); +   } + +   return TRUE; +} + +/** + * Emit trunc.   + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], d_reg[4]; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   /* Convert float to int */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0); +   } + +   /* Convert int to float */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit floor.   
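
emit_TRUNC() above leans on the fact that a float -> signed int -> float round trip rounds toward zero, and emit_FLR() below reuses the same round trip after nudging negative inputs down by 1.0 so the truncation lands on the lower integer. A plain C illustration:

#include <stdio.h>

/* Truncation toward zero via an int round trip, as TRUNC is emitted with
 * cflts/csflt; FLR additionally subtracts 1.0 from negative inputs first.
 */
static float trunc_via_int(float x)
{
   return (float)(int)x;             /* C float->int conversion truncates */
}

int main(void)
{
   printf("%g %g\n", trunc_via_int(2.7f), trunc_via_int(-2.7f));   /* 2 -2 */
   printf("%g\n", trunc_via_int(-2.7f - 1.0f));                    /* -3 == floor(-2.7) */
   return 0;
}
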
+ * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + +   zero_reg = get_itemp(gen); +   spe_zero(gen->f, zero_reg); +   one_reg = get_const_one_reg(gen); +    +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      tmp_reg[ch] = get_itemp(gen); +   } + +   /* If negative, subtract 1.0 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); +   } + +   /* Convert float to int */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); +   } + +   /* Convert int to float */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Compute frac = Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + +   zero_reg = get_itemp(gen); +   spe_zero(gen->f, zero_reg); +   one_reg = get_const_one_reg(gen); + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      tmp_reg[ch] = get_itemp(gen); +   } + +   /* If negative, subtract 1.0 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); +   } + +   /* Convert float to int */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); +   } + +   /* Convert int to float */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0); +   } + +   /* d = s1 - FLR(s1) */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); +   } + +   /* store result */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +#if 0 +static void +print_functions(struct cell_context *cell) +{ +   struct cell_spu_function_info *funcs = &cell->spu_functions; +   uint i; +   for (i = 0; i < funcs->num; i++) { +      printf("SPU func %u: %s at %u\n", +             i, funcs->names[i], funcs->addrs[i]); +   } +} +#endif + + +static uint +lookup_function(struct cell_context *cell, const char *funcname) +{ +   const struct cell_spu_function_info *funcs = &cell->spu_functions; +   uint i, addr = 0; +   for (i = 0; i < funcs->num; i++) { +      if (strcmp(funcs->names[i], funcname) == 0) { +         addr = funcs->addrs[i]; +      } +   } +   assert(addr && "spu function not found"); +  
 return addr / 4;  /* discard 2 least significant bits */ +} + + +/** + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. + */ +static boolean +emit_function_call(struct codegen *gen, +                   const struct tgsi_full_instruction *inst, +                   char *funcname, uint num_args, boolean scalar) +{ +   const uint addr = lookup_function(gen->cell, funcname); +   char comment[100]; +   int s_regs[3]; +   int func_called = FALSE; +   uint a, ch; +   int retval_reg = -1; + +   assert(num_args <= 3); + +   snprintf(comment, sizeof(comment), "CALL %s:", funcname); +   spe_comment(gen->f, -4, comment); + +   if (scalar) { +      for (a = 0; a < num_args; a++) { +         s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); +      } +      /* we'll call the function, put the return value in this register, +       * then replicate it across all write-enabled components in d_reg. +       */ +      retval_reg = spe_allocate_available_register(gen->f); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int d_reg; +      ubyte usedRegs[SPE_NUM_REGS]; +      uint i, numUsed; + +      if (!scalar) { +         for (a = 0; a < num_args; a++) { +            s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); +         } +      } + +      d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + +      if (!scalar || !func_called) { +         /* for a scalar function, we'll really only call the function once */ + +         numUsed = spe_get_registers_used(gen->f, usedRegs); +         assert(numUsed < gen->frame_size / 16 - 2); + +         /* save registers to stack */ +         for (i = 0; i < numUsed; i++) { +            uint reg = usedRegs[i]; +            int offset = 2 + i; +            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); +         } + +         /* setup function arguments */ +         for (a = 0; a < num_args; a++) { +            spe_move(gen->f, 3 + a, s_regs[a]); +         } + +         /* branch to function, save return addr */ +         spe_brasl(gen->f, SPE_REG_RA, addr); + +         /* save function's return value */ +         if (scalar) +            spe_move(gen->f, retval_reg, 3); +         else +            spe_move(gen->f, d_reg, 3); + +         /* restore registers from stack */ +         for (i = 0; i < numUsed; i++) { +            uint reg = usedRegs[i]; +            if (reg != d_reg && reg != retval_reg) { +               int offset = 2 + i; +               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); +            } +         } + +         func_called = TRUE; +      } + +      if (scalar) { +         spe_move(gen->f, d_reg, retval_reg); +      } + +      store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); +      free_itemps(gen); +   } + +   if (scalar) { +      spe_release_register(gen->f, retval_reg); +   } + +   return TRUE; +} + + +static boolean +emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   const uint target = inst->InstructionExtTexture.Texture; +   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; +   uint addr; +   int ch; +   int coord_regs[4], d_regs[4]; + +   switch (target) { +   case TGSI_TEXTURE_1D: +   case TGSI_TEXTURE_2D: +      addr = lookup_function(gen->cell, "spu_tex_2d"); +      break; +   case TGSI_TEXTURE_3D: +      addr = lookup_function(gen->cell, "spu_tex_3d"); +    
  break; +   case TGSI_TEXTURE_CUBE: +      addr = lookup_function(gen->cell, "spu_tex_cube"); +      break; +   default: +      ASSERT(0 && "unsupported texture target"); +      return FALSE; +   } + +   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER); + +   spe_comment(gen->f, -4, "CALL tex:"); + +   /* get src/dst reg info */ +   for (ch = 0; ch < 4; ch++) { +      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +   } + +   { +      ubyte usedRegs[SPE_NUM_REGS]; +      uint i, numUsed; + +      numUsed = spe_get_registers_used(gen->f, usedRegs); +      assert(numUsed < gen->frame_size / 16 - 2); + +      /* save registers to stack */ +      for (i = 0; i < numUsed; i++) { +         uint reg = usedRegs[i]; +         int offset = 2 + i; +         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); +      } + +      /* setup function arguments (XXX depends on target) */ +      for (i = 0; i < 4; i++) { +         spe_move(gen->f, 3 + i, coord_regs[i]); +      } +      spe_load_uint(gen->f, 7, unit); /* sampler unit */ + +      /* branch to function, save return addr */ +      spe_brasl(gen->f, SPE_REG_RA, addr); + +      /* save function's return values (four pixel's colors) */ +      for (i = 0; i < 4; i++) { +         spe_move(gen->f, d_regs[i], 3 + i); +      } + +      /* restore registers from stack */ +      for (i = 0; i < numUsed; i++) { +         uint reg = usedRegs[i]; +         if (reg != d_regs[0] && +             reg != d_regs[1] && +             reg != d_regs[2] && +             reg != d_regs[3]) { +            int offset = 2 + i; +            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); +         } +      } +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); +      free_itemps(gen); +   } + +   return TRUE; +} + + +/** + * KILL if any of src reg values are less than zero. + */ +static boolean +emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch; +   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg; + +   spe_comment(gen->f, -4, "CALL kil:"); + +   /* zero = {0,0,0,0} */ +   zero_reg = get_itemp(gen); +   spe_zero(gen->f, zero_reg); + +   cmp_reg = get_itemp(gen); + +   /* get src regs */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +   } + +   /* test if any src regs are < 0 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      if (kil_reg >= 0) { +         /* cmp = 0 > src ? : ~0 : 0 */ +         spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); +         /* kil = kil | cmp */ +         spe_or(gen->f, kil_reg, kil_reg, cmp_reg); +      } +      else { +         kil_reg = get_itemp(gen); +         /* kil = 0 > src ? : ~0 : 0 */ +         spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); +      } +   } + +   if (gen->if_nesting || gen->loop_nesting) { +      /* may have been a conditional kil */ +      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg); +   } + +   /* allocate the kill mask reg if needed */ +   if (gen->kill_mask_reg <= 0) { +      gen->kill_mask_reg = spe_allocate_available_register(gen->f); +      spe_move(gen->f, gen->kill_mask_reg, kil_reg); +   } +   else { +      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg); +   } + +   free_itemps(gen); + +   return TRUE; +} + + + +/** + * Emit min or max. 
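
Stepping back to emit_KIL() above: it builds its mask by OR-ing together a "component < 0" compare result for every enabled channel, then folds that into the shader's running kill mask, which the epilogue later returns. The per-pixel logic in scalar C (one pixel of the quad, illustrative names):

#include <stdint.h>
#include <stdio.h>

/* Per-pixel KIL logic: OR together a "component < 0" mask for every enabled
 * channel; a nonzero result means the fragment is killed.
 */
static uint32_t kill_mask(const float src[4], unsigned writemask)
{
   uint32_t kil = 0;
   int ch;

   for (ch = 0; ch < 4; ch++) {
      if (writemask & (1u << ch))
         kil |= (0.0f > src[ch]) ? 0xffffffffu : 0u;   /* fcgt-style compare */
   }
   return kil;
}

int main(void)
{
   const float a[4] = { 0.5f, -0.1f, 1.0f, 1.0f };   /* one negative component */
   const float b[4] = { 0.5f,  0.1f, 1.0f, 1.0f };   /* all non-negative */

   printf("0x%08x 0x%08x\n",
          (unsigned) kill_mask(a, 0xf),
          (unsigned) kill_mask(b, 0xf));
   return 0;
}
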
+ */ +static boolean +emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); +      d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); +      tmp_reg[ch] = get_itemp(gen);          +   } + +   /* d = (s0 > s1) ? s0 : s1 */ +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      if (inst->Instruction.Opcode == TGSI_OPCODE_MAX) +         spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); +      else +         spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); +   } +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); +   } + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); +   } + +   free_itemps(gen); +   return TRUE; +} + + +/** + * Emit code to update the execution mask. + * This needs to be done whenever the execution status of a conditional + * or loop is changed. + */ +static void +emit_update_exec_mask(struct codegen *gen) +{ +   const int exec_reg = get_exec_mask_reg(gen); +   const int cond_reg = gen->cond_mask_reg; +   const int loop_reg = gen->loop_mask_reg; + +   spe_comment(gen->f, 0, "Update master execution mask"); + +   if (gen->if_nesting > 0 && gen->loop_nesting > 0) { +      /* exec_mask = cond_mask & loop_mask */ +      assert(cond_reg > 0); +      assert(loop_reg > 0); +      spe_and(gen->f, exec_reg, cond_reg, loop_reg); +   } +   else if (gen->if_nesting > 0) { +      assert(cond_reg > 0); +      spe_move(gen->f, exec_reg, cond_reg); +   } +   else if (gen->loop_nesting > 0) { +      assert(loop_reg > 0); +      spe_move(gen->f, exec_reg, loop_reg); +   } +   else { +      spe_load_int(gen->f, exec_reg, ~0x0); +   } +} + + +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   const int channel = 0; +   int cond_reg; + +   cond_reg = get_cond_mask_reg(gen); + +   /* XXX push cond exec mask */ + +   spe_comment(gen->f,  0, "init conditional exec mask = ~0:"); +   spe_load_int(gen->f, cond_reg, ~0); + +   /* update conditional execution mask with the predicate register */ +   int tmp_reg = get_itemp(gen); +   int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + +   /* tmp = (s1_reg == 0) */ +   spe_ceqi(gen->f, tmp_reg, s1_reg, 0); +   /* tmp = !tmp */ +   spe_complement(gen->f, tmp_reg, tmp_reg); +   /* cond_mask = cond_mask & tmp */ +   spe_and(gen->f, cond_reg, cond_reg, tmp_reg); + +   gen->if_nesting++; + +   /* update the master execution mask */ +   emit_update_exec_mask(gen); + +   free_itemps(gen); + +   return TRUE; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   const int cond_reg = get_cond_mask_reg(gen); + +   spe_comment(gen->f, 0, "cond exec mask = !cond exec mask"); +   spe_complement(gen->f, cond_reg, cond_reg); +   emit_update_exec_mask(gen); + +   return TRUE; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   /* XXX todo: pop cond exec mask */ + +   gen->if_nesting--; + +   emit_update_exec_mask(gen); + +   return TRUE; +} + + +static boolean +emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   int exec_reg, loop_reg; + +   exec_reg = get_exec_mask_reg(gen); +   
loop_reg = get_loop_mask_reg(gen); + +   /* XXX push loop_exec mask */ + +   spe_comment(gen->f,  0*-4, "initialize loop exec mask = ~0"); +   spe_load_int(gen->f, loop_reg, ~0x0); + +   gen->loop_nesting++; +   gen->loop_start = spe_code_size(gen->f);  /* in bytes */ + +   return TRUE; +} + + +static boolean +emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   const int loop_reg = get_loop_mask_reg(gen); +   const int tmp_reg = get_itemp(gen); +   int offset; + +   /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */ +   spe_orx(gen->f, tmp_reg, loop_reg); + +   offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */ + +   /* branch back to top of loop if tmp_reg != 0 */ +   spe_brnz(gen->f, tmp_reg, offset / 4); + +   /* XXX pop loop_exec mask */ + +   gen->loop_nesting--; + +   emit_update_exec_mask(gen); + +   return TRUE; +} + + +static boolean +emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   const int exec_reg = get_exec_mask_reg(gen); +   const int loop_reg = get_loop_mask_reg(gen); + +   assert(gen->loop_nesting > 0); + +   spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask"); +   spe_andc(gen->f, loop_reg, loop_reg, exec_reg); + +   emit_update_exec_mask(gen); + +   return TRUE; +} + + +static boolean +emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ +   assert(gen->loop_nesting > 0); + +   return TRUE; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, +             boolean ddx) +{ +   int ch; + +   FOR_EACH_ENABLED_CHANNEL(inst, ch) { +      int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); +      int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + +      int t1_reg = get_itemp(gen); +      int t2_reg = get_itemp(gen); + +      spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ +      if (ddx) { +         spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ +      } +      else { +         spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ +      } +      spe_fs(gen->f, d_reg, t2_reg, t1_reg); + +      free_itemps(gen); +   } + +   return TRUE; +} + + + + +/** + * Emit END instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ +   emit_epilogue(gen); +   return TRUE; +} + + +/** + * Emit code for the given instruction.  Just a big switch stmt. 
+ */ +static boolean +emit_instruction(struct codegen *gen, +                 const struct tgsi_full_instruction *inst) +{ +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_ARL: +      return emit_ARL(gen, inst); +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_SWZ: +      return emit_MOV(gen, inst); +   case TGSI_OPCODE_ADD: +   case TGSI_OPCODE_SUB: +   case TGSI_OPCODE_MUL: +      return emit_binop(gen, inst); +   case TGSI_OPCODE_MAD: +      return emit_MAD(gen, inst); +   case TGSI_OPCODE_LERP: +      return emit_LERP(gen, inst); +   case TGSI_OPCODE_DP3: +      return emit_DP3(gen, inst); +   case TGSI_OPCODE_DP4: +      return emit_DP4(gen, inst); +   case TGSI_OPCODE_DPH: +      return emit_DPH(gen, inst); +   case TGSI_OPCODE_NRM: +      return emit_NRM3(gen, inst); +   case TGSI_OPCODE_XPD: +      return emit_XPD(gen, inst); +   case TGSI_OPCODE_RCP: +   case TGSI_OPCODE_RSQ: +      return emit_RCP_RSQ(gen, inst); +   case TGSI_OPCODE_ABS: +      return emit_ABS(gen, inst); +   case TGSI_OPCODE_SGT: +   case TGSI_OPCODE_SLT: +   case TGSI_OPCODE_SGE: +   case TGSI_OPCODE_SLE: +   case TGSI_OPCODE_SEQ: +   case TGSI_OPCODE_SNE: +      return emit_inequality(gen, inst); +   case TGSI_OPCODE_CMP: +      return emit_CMP(gen, inst); +   case TGSI_OPCODE_MIN: +   case TGSI_OPCODE_MAX: +      return emit_MIN_MAX(gen, inst); +   case TGSI_OPCODE_TRUNC: +      return emit_TRUNC(gen, inst); +   case TGSI_OPCODE_FLR: +      return emit_FLR(gen, inst); +   case TGSI_OPCODE_FRC: +      return emit_FRC(gen, inst); +   case TGSI_OPCODE_END: +      return emit_END(gen); + +   case TGSI_OPCODE_COS: +      return emit_function_call(gen, inst, "spu_cos", 1, TRUE); +   case TGSI_OPCODE_SIN: +      return emit_function_call(gen, inst, "spu_sin", 1, TRUE); +   case TGSI_OPCODE_POW: +      return emit_function_call(gen, inst, "spu_pow", 2, TRUE); +   case TGSI_OPCODE_EXPBASE2: +      return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); +   case TGSI_OPCODE_LOGBASE2: +      return emit_function_call(gen, inst, "spu_log2", 1, TRUE); +   case TGSI_OPCODE_TEX: +      /* fall-through for now */ +   case TGSI_OPCODE_TXD: +      /* fall-through for now */ +   case TGSI_OPCODE_TXB: +      /* fall-through for now */ +   case TGSI_OPCODE_TXL: +      /* fall-through for now */ +   case TGSI_OPCODE_TXP: +      return emit_TEX(gen, inst); +   case TGSI_OPCODE_KIL: +      return emit_KIL(gen, inst); + +   case TGSI_OPCODE_IF: +      return emit_IF(gen, inst); +   case TGSI_OPCODE_ELSE: +      return emit_ELSE(gen, inst); +   case TGSI_OPCODE_ENDIF: +      return emit_ENDIF(gen, inst); + +   case TGSI_OPCODE_BGNLOOP2: +      return emit_BGNLOOP(gen, inst); +   case TGSI_OPCODE_ENDLOOP2: +      return emit_ENDLOOP(gen, inst); +   case TGSI_OPCODE_BRK: +      return emit_BRK(gen, inst); +   case TGSI_OPCODE_CONT: +      return emit_CONT(gen, inst); + +   case TGSI_OPCODE_DDX: +      return emit_DDX_DDY(gen, inst, TRUE); +   case TGSI_OPCODE_DDY: +      return emit_DDX_DDY(gen, inst, FALSE); + +   /* XXX lots more cases to do... */ + +   default: +      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", +              inst->Instruction.Opcode); +      return FALSE; +   } + +   return TRUE; +} + + + +/** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! 
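
emit_immediate() below allocates one SPE register per component of the immediate but shares the previous component's register whenever the value repeats, so a vector like {0.5, 0.5, 0.5, 1.0} costs two loads instead of four. A small sketch of that map-building step, with the register allocator reduced to a counter and the names made up for illustration:

#include <stdio.h>

/* Build the per-channel register map for one immediate, sharing the previous
 * channel's register when the value repeats.
 */
static int alloc_reg(void)
{
   static int next_reg = 10;
   return next_reg++;
}

static void map_immediate(const float val[4], int reg[4])
{
   int ch;
   for (ch = 0; ch < 4; ch++) {
      if (ch > 0 && val[ch] == val[ch - 1])
         reg[ch] = reg[ch - 1];       /* same value: reuse, no extra load */
      else
         reg[ch] = alloc_reg();       /* new value: new register + load   */
   }
}

int main(void)
{
   const float imm[4] = { 0.5f, 0.5f, 0.5f, 1.0f };
   int reg[4];

   map_immediate(imm, reg);
   printf("$%d $%d $%d $%d\n", reg[0], reg[1], reg[2], reg[3]);   /* $10 $10 $10 $11 */
   return 0;
}
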
+ */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ +   int ch; + +   assert(gen->num_imm < MAX_TEMPS); + +   for (ch = 0; ch < 4; ch++) { +      float val = immed->u.ImmediateFloat32[ch].Float; + +      if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) { +         /* re-use previous register */ +         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; +      } +      else { +         char str[100]; +         int reg = spe_allocate_available_register(gen->f); + +         if (reg < 0) +            return FALSE; + +         sprintf(str, "init $%d = %f", reg, val); +         spe_comment(gen->f, 0, str); + +         /* update immediate map */ +         gen->imm_regs[gen->num_imm][ch] = reg; + +         /* emit initializer instruction */ +         spe_load_float(gen->f, reg, val); +      } +   } + +   gen->num_imm++; + +   return TRUE; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. + * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static boolean +emit_declaration(struct cell_context *cell, +                 struct codegen *gen, const struct tgsi_full_declaration *decl) +{ +   int i, ch; + +   switch (decl->Declaration.File) { +   case TGSI_FILE_TEMPORARY: +      for (i = decl->DeclarationRange.First; +           i <= decl->DeclarationRange.Last; +           i++) { +         assert(i < MAX_TEMPS); +         for (ch = 0; ch < 4; ch++) { +            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); +            if (gen->temp_regs[i][ch] < 0) +               return FALSE; /* out of regs */ +         } + +         /* XXX if we run out of SPE registers, we need to spill +          * to SPU memory.  someday... +          */ + +         { +            char buf[100]; +            sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, +                    gen->temp_regs[i][0], gen->temp_regs[i][1], +                    gen->temp_regs[i][2], gen->temp_regs[i][3]); +            spe_comment(gen->f, 0, buf); +         } +      } +      break; +   default: +      ; /* ignore */ +   } + +   return TRUE; +} + + + +/** + * Translate TGSI shader code to SPE instructions.  This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell    the rendering context (in) + * \param tokens  the TGSI shader (in) + * \param f       the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, +                          const struct tgsi_token *tokens, +                          struct spe_function *f) +{ +   struct tgsi_parse_context parse; +   struct codegen gen; +   uint ic = 0; + +   memset(&gen, 0, sizeof(gen)); +   gen.cell = cell; +   gen.f = f; + +   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
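+    * Here the generated code receives three pointer arguments: the
+    * inputs, outputs and constants arrays assigned below.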
*/ +   gen.inputs_reg = 3;     /* pointer to inputs array */ +   gen.outputs_reg = 4;    /* pointer to outputs array */ +   gen.constants_reg = 5;  /* pointer to constants array */ + +   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); +   spe_allocate_register(f, gen.inputs_reg); +   spe_allocate_register(f, gen.outputs_reg); +   spe_allocate_register(f, gen.constants_reg); + +   if (cell->debug_flags & CELL_DEBUG_ASM) { +      spe_print_code(f, TRUE); +      spe_indent(f, 2*8); +      printf("Begin %s\n", __FUNCTION__); +      tgsi_dump(tokens, 0); +   } + +   tgsi_parse_init(&parse, tokens); + +   emit_prologue(&gen); + +   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { +      tgsi_parse_token(&parse); + +      switch (parse.FullToken.Token.Type) { +      case TGSI_TOKEN_TYPE_IMMEDIATE: +         if (f->print) { +            _debug_printf("    # "); +            tgsi_dump_immediate(&parse.FullToken.FullImmediate); +         } +         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) +            gen.error = TRUE; +         break; + +      case TGSI_TOKEN_TYPE_DECLARATION: +         if (f->print) { +            _debug_printf("    # "); +            tgsi_dump_declaration(&parse.FullToken.FullDeclaration); +         } +         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) +            gen.error = TRUE; +         break; + +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         if (f->print) { +            _debug_printf("    # "); +            ic++; +            tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic); +         } +         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) +            gen.error = TRUE; +         break; + +      default: +         assert(0); +      } +   } + +   if (gen.error) { +      /* terminate the SPE code */ +      return emit_END(&gen); +   } + +   if (cell->debug_flags & CELL_DEBUG_ASM) { +      printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); +      printf("End %s\n", __FUNCTION__); +   } + +   tgsi_parse_free( &parse ); + +   return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 0000000000..99faea7046 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, +                          const struct tgsi_token *tokens, +                          struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 0000000000..66d4b3b6a3 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,2181 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2009 VMware, Inc.  All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + * \author Bob Ellison + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa         Gallium depth/stencil/alpha state to gen code for + * \param f           SPE function to append instruction onto. + * \param mask_reg    register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg  register containing integer fragment Z values (in) + * \param ifbZ_reg    register containing integer frame buffer Z values (in/out) + * \param zmask_reg   register containing result of Z test/comparison (out) + * + * Returns TRUE if the Z-buffer needs to be updated. + */ +static boolean +gen_depth_test(struct spe_function *f, +               const struct pipe_depth_stencil_alpha_state *dsa, +               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ +   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ +    * quantities.  
This only makes a difference for 32-bit Z values though. +    */ +   ASSERT(dsa->depth.enabled); + +   switch (dsa->depth.func) { +   case PIPE_FUNC_EQUAL: +      /* zmask = (ifragZ == ref) */ +      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_NOTEQUAL: +      /* zmask = (ifragZ == ref) */ +      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_GREATER: +      /* zmask = (ifragZ > ref) */ +      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_LESS: +      /* zmask = (ref > ifragZ) */ +      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); +      /* mask = (mask & zmask) */ +      spe_and(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_LEQUAL: +      /* zmask = (ifragZ > ref) */ +      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_GEQUAL: +      /* zmask = (ref > ifragZ) */ +      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); +      /* mask = (mask & ~zmask) */ +      spe_andc(f, mask_reg, mask_reg, zmask_reg); +      break; + +   case PIPE_FUNC_NEVER: +      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */ +      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */ +      break; + +   case PIPE_FUNC_ALWAYS: +      /* mask unchanged */ +      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */ +      break; + +   default: +      ASSERT(0); +      break; +   } + +   if (dsa->depth.writemask) { +      /* +       * If (ztest passed) { +       *    framebufferZ = fragmentZ; +       * } +       * OR, +       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; +       */ +      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); +      return TRUE; +   } + +   return FALSE; +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa        Gallium depth/stencil/alpha state to gen code for + * \param f          SPE function to append instruction onto. 
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg  register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, +               struct spe_function *f, int mask_reg, int fragA_reg) +{ +   int ref_reg = spe_allocate_available_register(f); +   int amask_reg = spe_allocate_available_register(f); + +   ASSERT(dsa->alpha.enabled); + +   if ((dsa->alpha.func != PIPE_FUNC_NEVER) && +       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { +      /* load/splat the alpha reference float value */ +      spe_load_float(f, ref_reg, dsa->alpha.ref_value); +   } + +   /* emit code to do the alpha comparison, updating 'mask' */ +   switch (dsa->alpha.func) { +   case PIPE_FUNC_EQUAL: +      /* amask = (fragA == ref) */ +      spe_fceq(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_NOTEQUAL: +      /* amask = (fragA == ref) */ +      spe_fceq(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & ~amask) */ +      spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_GREATER: +      /* amask = (fragA > ref) */ +      spe_fcgt(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_LESS: +      /* amask = (ref > fragA) */ +      spe_fcgt(f, amask_reg, ref_reg, fragA_reg); +      /* mask = (mask & amask) */ +      spe_and(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_LEQUAL: +      /* amask = (fragA > ref) */ +      spe_fcgt(f, amask_reg, fragA_reg, ref_reg); +      /* mask = (mask & ~amask) */ +      spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_GEQUAL: +      /* amask = (ref > fragA) */ +      spe_fcgt(f, amask_reg, ref_reg, fragA_reg); +      /* mask = (mask & ~amask) */ +      spe_andc(f, mask_reg, mask_reg, amask_reg); +      break; + +   case PIPE_FUNC_NEVER: +      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */ +      break; + +   case PIPE_FUNC_ALWAYS: +      /* no-op, mask unchanged */ +      break; + +   default: +      ASSERT(0); +      break; +   } + +#if OPTIMIZATIONS +   /* if mask == {0,0,0,0} we're all done, return */ +   { +      /* re-use amask reg here */ +      int tmp_reg = amask_reg; +      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ +      spe_orx(f, tmp_reg, mask_reg); +      /* if tmp[0] == 0 then return from function call */ +      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); +   } +#endif + +   spe_release_register(f, ref_reg); +   spe_release_register(f, amask_reg); +} + + +/** + * This pair of functions is used inline to allocate and deallocate + * optional constant registers.  Once a constant is discovered to be  + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. 
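+ *
+ * A minimal usage sketch (the 0.5f value is only illustrative):
+ *
+ *    int constA_reg = -1;                         <-- not yet allocated
+ *    setup_const_register(f, &constA_reg, 0.5f);  <-- allocates and loads
+ *    setup_const_register(f, &constA_reg, 0.5f);  <-- no-op, already set up
+ *    release_const_register(f, constA_reg);       <-- no-op if still -1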
+ */ +static INLINE void +setup_optional_register(struct spe_function *f, +                        int *r) +{ +   if (*r < 0) +      *r = spe_allocate_available_register(f); +} + +static INLINE void +release_optional_register(struct spe_function *f, +                          int r) +{ +   if (r >= 0) +      spe_release_register(f, r); +} + +static INLINE void +setup_const_register(struct spe_function *f, +                     int *r, +                     float value) +{ +   if (*r >= 0) +      return; +   setup_optional_register(f, r); +   spe_load_float(f, *r, value); +} + +static INLINE void +release_const_register(struct spe_function *f, +                       int r) +{ +   release_optional_register(f, r); +} + + + +/** + * Unpack/convert framebuffer colors from four 32-bit packed colors + * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). + * Each 8-bit color component is expanded into a float in [0.0, 1.0]. + */ +static void +unpack_colors(struct spe_function *f, +              enum pipe_format color_format, +              int fbRGBA_reg, +              int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg) +{ +   int mask0_reg = spe_allocate_available_register(f); +   int mask1_reg = spe_allocate_available_register(f); +   int mask2_reg = spe_allocate_available_register(f); +   int mask3_reg = spe_allocate_available_register(f); + +   spe_load_int(f, mask0_reg, 0xff); +   spe_load_int(f, mask1_reg, 0xff00); +   spe_load_int(f, mask2_reg, 0xff0000); +   spe_load_int(f, mask3_reg, 0xff000000); + +   spe_comment(f, 0, "Unpack framebuffer colors, convert to floats"); + +   switch (color_format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +      /* fbB = fbRGBA & mask */ +      spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg); + +      /* fbG = fbRGBA & mask */ +      spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg); + +      /* fbR = fbRGBA & mask */ +      spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg); + +      /* fbA = fbRGBA & mask */ +      spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg); + +      /* fbG = fbG >> 8 */ +      spe_roti(f, fbG_reg, fbG_reg, -8); + +      /* fbR = fbR >> 16 */ +      spe_roti(f, fbR_reg, fbR_reg, -16); + +      /* fbA = fbA >> 24 */ +      spe_roti(f, fbA_reg, fbA_reg, -24); +      break; + +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      /* fbA = fbRGBA & mask */ +      spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg); + +      /* fbR = fbRGBA & mask */ +      spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg); + +      /* fbG = fbRGBA & mask */ +      spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg); + +      /* fbB = fbRGBA & mask */ +      spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg); + +      /* fbR = fbR >> 8 */ +      spe_roti(f, fbR_reg, fbR_reg, -8); + +      /* fbG = fbG >> 16 */ +      spe_roti(f, fbG_reg, fbG_reg, -16); + +      /* fbB = fbB >> 24 */ +      spe_roti(f, fbB_reg, fbB_reg, -24); +      break; + +   default: +      ASSERT(0); +   } + +   /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ +   spe_cuflt(f, fbR_reg, fbR_reg, 8); +   spe_cuflt(f, fbG_reg, fbG_reg, 8); +   spe_cuflt(f, fbB_reg, fbB_reg, 8); +   spe_cuflt(f, fbA_reg, fbA_reg, 8); + +   spe_release_register(f, mask0_reg); +   spe_release_register(f, mask1_reg); +   spe_release_register(f, mask2_reg); +   spe_release_register(f, mask3_reg); +} + + +/** + * Generate SPE code to implement the given blend mode for a quad of pixels. + * \param f          SPE function to append instruction onto. 
+ * \param fragR_reg  register with fragment red values (float) (in/out) + * \param fragG_reg  register with fragment green values (float) (in/out) + * \param fragB_reg  register with fragment blue values (float) (in/out) + * \param fragA_reg  register with fragment alpha values (float) (in/out) + * \param fbRGBA_reg register with packed framebuffer colors (integer) (in) + */ +static void +gen_blend(const struct pipe_blend_state *blend, +          const struct pipe_blend_color *blend_color, +          struct spe_function *f, +          enum pipe_format color_format, +          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, +          int fbRGBA_reg) +{ +   int term1R_reg = spe_allocate_available_register(f); +   int term1G_reg = spe_allocate_available_register(f); +   int term1B_reg = spe_allocate_available_register(f); +   int term1A_reg = spe_allocate_available_register(f); + +   int term2R_reg = spe_allocate_available_register(f); +   int term2G_reg = spe_allocate_available_register(f); +   int term2B_reg = spe_allocate_available_register(f); +   int term2A_reg = spe_allocate_available_register(f); + +   int fbR_reg = spe_allocate_available_register(f); +   int fbG_reg = spe_allocate_available_register(f); +   int fbB_reg = spe_allocate_available_register(f); +   int fbA_reg = spe_allocate_available_register(f); + +   int tmp_reg = spe_allocate_available_register(f); + +   /* Optional constant registers we might or might not end up using; +    * if we do use them, make sure we only allocate them once by +    * keeping a flag on each one. +    */ +   int one_reg = -1; +   int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1; + +   ASSERT(blend->blend_enable); + +   /* packed RGBA -> float colors */ +   unpack_colors(f, color_format, fbRGBA_reg, +                 fbR_reg, fbG_reg, fbB_reg, fbA_reg); + +   /* +    * Compute Src RGB terms.  We're actually looking for the value +    * of (the appropriate RGB factors) * (the incoming source RGB color), +    * because in some cases (like PIPE_BLENDFACTOR_ONE and  +    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. 
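+    *
+    * For example, PIPE_BLENDFACTOR_INV_SRC_ALPHA needs the term
+    * (R*(1-A), G*(1-A), B*(1-A)) = (R-R*A, G-G*A, B-B*A), which maps
+    * directly onto fnms (a = d - b*c), so "1-A" is never computed
+    * explicitly.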
+    */ +   switch (blend->rgb_src_factor) { +   case PIPE_BLENDFACTOR_ONE: +      /* factors = (1,1,1), so term = (R,G,B) */ +      spe_move(f, term1R_reg, fragR_reg); +      spe_move(f, term1G_reg, fragG_reg); +      spe_move(f, term1B_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      /* factors = (0,0,0), so term = (0,0,0) */ +      spe_load_float(f, term1R_reg, 0.0f); +      spe_load_float(f, term1G_reg, 0.0f); +      spe_load_float(f, term1B_reg, 0.0f); +      break; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ +      spe_fm(f, term1R_reg, fragR_reg, fragR_reg); +      spe_fm(f, term1G_reg, fragG_reg, fragG_reg); +      spe_fm(f, term1B_reg, fragB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ +      spe_fm(f, term1R_reg, fragR_reg, fragA_reg); +      spe_fm(f, term1G_reg, fragG_reg, fragA_reg); +      spe_fm(f, term1B_reg, fragB_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B))  +       * or in other words term = (R-R*R, G-G*G, B-B*B) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); +      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); +      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_DST_COLOR: +      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ +      spe_fm(f, term1R_reg, fragR_reg, fbR_reg); +      spe_fm(f, term1G_reg, fragG_reg, fbG_reg); +      spe_fm(f, term1B_reg, fragB_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) +       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); +      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); +      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) +       * or term = (R-R*A,G-G*A,B-B*A) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); +      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); +      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_DST_ALPHA: +      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ +      spe_fm(f, term1R_reg, fragR_reg, fbA_reg); +      spe_fm(f, term1G_reg, fragG_reg, fbA_reg); +      spe_fm(f, term1B_reg, fragB_reg, fbA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb))  +       * or term = (R-R*Afb,G-G*Afb,b-B*Afb) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); +      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); +      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_CONST_COLOR: +      /* We need the optional constant color registers */ +      setup_const_register(f, &constR_reg, blend_color->color[0]); +      setup_const_register(f, &constG_reg, blend_color->color[1]); +      setup_const_register(f, 
&constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
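+    *
+    * Note that the "COLOR" factors reduce to the alpha component here;
+    * e.g. PIPE_BLENDFACTOR_SRC_COLOR and PIPE_BLENDFACTOR_SRC_ALPHA both
+    * end up as a factor of A.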
+    */ +   switch (blend->alpha_src_factor) { +   case PIPE_BLENDFACTOR_ZERO: +      /* factor = 0, so term = 0 */ +      spe_load_float(f, term1A_reg, 0.0f); +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ +   case PIPE_BLENDFACTOR_ONE: +      /* factor = 1, so term = A */ +      spe_move(f, term1A_reg, fragA_reg); +      break; + +   case PIPE_BLENDFACTOR_SRC_COLOR: +      /* factor = A, so term = A*A */ +      spe_fm(f, term1A_reg, fragA_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      spe_fm(f, term1A_reg, fragA_reg, fragA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      /* factor = 1-A, so term = A*(1-A) = A-A*A */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); +      break; + +   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_DST_COLOR: +      /* factor = Afb, so term = A*Afb */ +      spe_fm(f, term1A_reg, fragA_reg, fbA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); +      break; + +   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_CONST_COLOR: +      /* We need the optional constA_reg register */ +      setup_const_register(f, &constA_reg, blend_color->color[3]); +      /* factor = Ac, so term = A*Ac */ +      spe_fm(f, term1A_reg, fragA_reg, constA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_CONST_COLOR: +      /* We need the optional constA_reg register */ +      setup_const_register(f, &constA_reg, blend_color->color[3]); +      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); +      break; + +      /* These are special D3D cases involving a second color output +       * from the fragment shader.  I'm not sure we can support them +       * yet... XXX +       */ +   case PIPE_BLENDFACTOR_SRC1_COLOR: +   case PIPE_BLENDFACTOR_SRC1_ALPHA: +   case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +   default: +      ASSERT(0); +   } + +   /* +    * Compute Dest RGB term.  Like the above, we're looking for +    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because +    * in many cases we can avoid doing unnecessary multiplies. 
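+    *
+    * This mirrors the source RGB switch above, but with the framebuffer
+    * colors (Rfb,Gfb,Bfb) as the values being scaled.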
+    */ +   switch (blend->rgb_dst_factor) { +   case PIPE_BLENDFACTOR_ONE: +      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ +      spe_move(f, term2R_reg, fbR_reg); +      spe_move(f, term2G_reg, fbG_reg); +      spe_move(f, term2B_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      /* factor s= (0,0,0), so term = (0,0,0) */ +      spe_load_float(f, term2R_reg, 0.0f); +      spe_load_float(f, term2G_reg, 0.0f); +      spe_load_float(f, term2B_reg, 0.0f); +      break; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ +      spe_fm(f, term2R_reg, fbR_reg, fragR_reg); +      spe_fm(f, term2G_reg, fbG_reg, fragG_reg); +      spe_fm(f, term2B_reg, fbB_reg, fragB_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B))  +       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); +      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); +      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ +      spe_fm(f, term2R_reg, fbR_reg, fragA_reg); +      spe_fm(f, term2G_reg, fbG_reg, fragA_reg); +      spe_fm(f, term2B_reg, fbB_reg, fragA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); +      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); +      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_DST_COLOR: +      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ +      spe_fm(f, term2R_reg, fbR_reg, fbR_reg); +      spe_fm(f, term2G_reg, fbG_reg, fbG_reg); +      spe_fm(f, term2B_reg, fbB_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) +       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); +      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); +      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); +      break; + +   case PIPE_BLENDFACTOR_DST_ALPHA: +      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ +      spe_fm(f, term2R_reg, fbR_reg, fbA_reg); +      spe_fm(f, term2G_reg, fbG_reg, fbA_reg); +      spe_fm(f, term2B_reg, fbB_reg, fbA_reg); +      break; +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb))  +       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) +       * fnms(a,b,c,d) computes a = d - b*c +       */ +      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); +      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); +      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); +      break; +   case PIPE_BLENDFACTOR_CONST_COLOR: +      /* We need the optional constant color registers */ +      setup_const_register(f, &constR_reg, blend_color->color[0]); +      setup_const_register(f, &constG_reg, blend_color->color[1]); +      setup_const_register(f, &constB_reg, 
blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
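+    *
+    * Note that PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE is not handled as a
+    * destination factor; it simply asserts below.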
+    */ +   switch (blend->alpha_dst_factor) { +   case PIPE_BLENDFACTOR_ONE: +      /* factor = 1, so term = Afb */ +      spe_move(f, term2A_reg, fbA_reg); +      break; +   case PIPE_BLENDFACTOR_ZERO: +      /* factor = 0, so term = 0 */ +      spe_load_float(f, term2A_reg, 0.0f); +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_SRC_COLOR: +      /* factor = A, so term = Afb*A */ +      spe_fm(f, term2A_reg, fbA_reg, fragA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); +      break; + +   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_DST_COLOR: +      /* factor = Afb, so term = Afb*Afb */ +      spe_fm(f, term2A_reg, fbA_reg, fbA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); +      break; + +   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_CONST_COLOR: +      /* We need the optional constA_reg register */ +      setup_const_register(f, &constA_reg, blend_color->color[3]); +      /* factor = Ac, so term = Afb*Ac */ +      spe_fm(f, term2A_reg, fbA_reg, constA_reg); +      break; + +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ +   case PIPE_BLENDFACTOR_INV_CONST_COLOR: +      /* We need the optional constA_reg register */ +      setup_const_register(f, &constA_reg, blend_color->color[3]); +      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ +      /* fnms(a,b,c,d) computes a = d - b*c */ +      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ +      ASSERT(0); +      break; + +      /* These are special D3D cases involving a second color output +       * from the fragment shader.  I'm not sure we can support them +       * yet... XXX +       */ +   case PIPE_BLENDFACTOR_SRC1_COLOR: +   case PIPE_BLENDFACTOR_SRC1_ALPHA: +   case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +   default: +      ASSERT(0); +   } + +   /* +    * Combine Src/Dest RGB terms as per the blend equation. 
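+    * E.g. PIPE_BLEND_ADD computes frag = term1 + term2, while
+    * PIPE_BLEND_REVERSE_SUBTRACT computes frag = term2 - term1.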
+    */ +   switch (blend->rgb_func) { +   case PIPE_BLEND_ADD: +      spe_fa(f, fragR_reg, term1R_reg, term2R_reg); +      spe_fa(f, fragG_reg, term1G_reg, term2G_reg); +      spe_fa(f, fragB_reg, term1B_reg, term2B_reg); +      break; +   case PIPE_BLEND_SUBTRACT: +      spe_fs(f, fragR_reg, term1R_reg, term2R_reg); +      spe_fs(f, fragG_reg, term1G_reg, term2G_reg); +      spe_fs(f, fragB_reg, term1B_reg, term2B_reg); +      break; +   case PIPE_BLEND_REVERSE_SUBTRACT: +      spe_fs(f, fragR_reg, term2R_reg, term1R_reg); +      spe_fs(f, fragG_reg, term2G_reg, term1G_reg); +      spe_fs(f, fragB_reg, term2B_reg, term1B_reg); +      break; +   case PIPE_BLEND_MIN: +      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); +      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); +      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); +      break; +   case PIPE_BLEND_MAX: +      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); +      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); +      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); +      break; +   default: +      ASSERT(0); +   } + +   /* +    * Combine Src/Dest A term +    */ +   switch (blend->alpha_func) { +   case PIPE_BLEND_ADD: +      spe_fa(f, fragA_reg, term1A_reg, term2A_reg); +      break; +   case PIPE_BLEND_SUBTRACT: +      spe_fs(f, fragA_reg, term1A_reg, term2A_reg); +      break; +   case PIPE_BLEND_REVERSE_SUBTRACT: +      spe_fs(f, fragA_reg, term2A_reg, term1A_reg); +      break; +   case PIPE_BLEND_MIN: +      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); +      break; +   case PIPE_BLEND_MAX: +      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); +      break; +   default: +      ASSERT(0); +   } + +   spe_release_register(f, term1R_reg); +   spe_release_register(f, term1G_reg); +   spe_release_register(f, term1B_reg); +   spe_release_register(f, term1A_reg); + +   spe_release_register(f, term2R_reg); +   spe_release_register(f, term2G_reg); +   spe_release_register(f, term2B_reg); +   spe_release_register(f, term2A_reg); + +   spe_release_register(f, fbR_reg); +   spe_release_register(f, fbG_reg); +   spe_release_register(f, fbB_reg); +   spe_release_register(f, fbA_reg); + +   spe_release_register(f, tmp_reg); + +   /* Free any optional registers that actually got used */ +   release_const_register(f, one_reg); +   release_const_register(f, constR_reg); +   release_const_register(f, constG_reg); +   release_const_register(f, constB_reg); +   release_const_register(f, constA_reg); +} + + +static void +gen_logicop(const struct pipe_blend_state *blend, +            struct spe_function *f, +            int fragRGBA_reg, int fbRGBA_reg) +{ +   /* We've got four 32-bit RGBA packed pixels in each of +    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point +    * reds, greens, blues, and alphas. 
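+    * The logic op is therefore applied directly to the packed 32-bit
+    * words, and the result is left in fragRGBA_reg.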
+    * */ +   ASSERT(blend->logicop_enable); + +   switch(blend->logicop_func) { +      case PIPE_LOGICOP_CLEAR: /* 0 */ +         spe_zero(f, fragRGBA_reg); +         break; +      case PIPE_LOGICOP_NOR: /* ~(s | d) */ +         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ +         /* andc R, A, B computes R = A & ~B */ +         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); +         break; +      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ +         spe_complement(f, fragRGBA_reg, fragRGBA_reg); +         break; +      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ +         /* andc R, A, B computes R = A & ~B */ +         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_INVERT: /* ~d */ +         /* Note that (A nor A) == ~(A|A) == ~A */ +         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_XOR: /* s ^ d */ +         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_NAND: /* ~(s & d) */ +         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_AND: /* s & d */ +         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ +         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         spe_complement(f, fragRGBA_reg, fragRGBA_reg); +         break; +      case PIPE_LOGICOP_NOOP: /* d */ +         spe_move(f, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ +         /* orc R, A, B computes R = A | ~B */ +         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); +         break; +      case PIPE_LOGICOP_COPY: /* s */ +         break; +      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ +         /* orc R, A, B computes R = A | ~B */ +         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_OR: /* s | d */ +         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); +         break; +      case PIPE_LOGICOP_SET: /* 1 */ +         spe_load_int(f, fragRGBA_reg, 0xffffffff); +         break; +      default: +         ASSERT(0); +   } +} + + +/** + * Generate code to pack a quad of float colors into four 32-bit integers. + * + * \param f             SPE function to append instruction onto. + * \param color_format  the dest color packing format + * \param r_reg         register containing four red values (in/clobbered) + * \param g_reg         register containing four green values (in/clobbered) + * \param b_reg         register containing four blue values (in/clobbered) + * \param a_reg         register containing four alpha values (in/clobbered) + * \param rgba_reg      register to store the packed RGBA colors (out) + */ +static void +gen_pack_colors(struct spe_function *f, +                enum pipe_format color_format, +                int r_reg, int g_reg, int b_reg, int a_reg, +                int rgba_reg) +{ +   int rg_reg = spe_allocate_available_register(f); +   int ba_reg = spe_allocate_available_register(f); + +   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ +   spe_cfltu(f, r_reg, r_reg, 32); +   spe_cfltu(f, g_reg, g_reg, 32); +   spe_cfltu(f, b_reg, b_reg, 32); +   spe_cfltu(f, a_reg, a_reg, 32); + +   /* Shift the most significant bytes to the least significant positions. 
+    * I.e.: reg = reg >> 24 +    */ +   spe_rotmi(f, r_reg, r_reg, -24); +   spe_rotmi(f, g_reg, g_reg, -24); +   spe_rotmi(f, b_reg, b_reg, -24); +   spe_rotmi(f, a_reg, a_reg, -24); + +   /* Shift the color bytes according to the surface format */ +   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { +      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */ +      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */ +      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */ +   } +   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { +      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */ +      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */ +      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */ +   } +   else { +      ASSERT(0); +   } + +   /* Merge red, green, blue, alpha registers to make packed RGBA colors. +    * Eg: after shifting according to color_format we might have: +    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} +    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} +    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} +    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} +    * OR-ing all those together gives us four packed colors: +    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} +    */ +   spe_or(f, rg_reg, r_reg, g_reg); +   spe_or(f, ba_reg, a_reg, b_reg); +   spe_or(f, rgba_reg, rg_reg, ba_reg); + +   spe_release_register(f, rg_reg); +   spe_release_register(f, ba_reg); +} + + +static void +gen_colormask(struct spe_function *f, +              uint colormask, +              enum pipe_format color_format, +              int fragRGBA_reg, int fbRGBA_reg) +{ +   /* We've got four 32-bit RGBA packed pixels in each of +    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point +    * reds, greens, blues, and alphas.  Further, the pixels +    * are packed according to the given color format, not +    * necessarily RGBA... +    */ +   uint r_mask; +   uint g_mask; +   uint b_mask; +   uint a_mask; + +   /* Calculate exactly where the bits for any particular color +    * end up, so we can mask them correctly. +    */ +   switch(color_format) { +      case PIPE_FORMAT_A8R8G8B8_UNORM: +         /* ARGB */ +         a_mask = 0xff000000; +         r_mask = 0x00ff0000; +         g_mask = 0x0000ff00; +         b_mask = 0x000000ff; +         break; +      case PIPE_FORMAT_B8G8R8A8_UNORM: +         /* BGRA */ +         b_mask = 0xff000000; +         g_mask = 0x00ff0000; +         r_mask = 0x0000ff00; +         a_mask = 0x000000ff; +         break; +      default: +         ASSERT(0); +   } + +   /* For each R, G, B, and A component we're supposed to mask out,  +    * clear its bits.   Then our mask operation later will work  +    * as expected. +    */ +   if (!(colormask & PIPE_MASK_R)) { +      r_mask = 0; +   } +   if (!(colormask & PIPE_MASK_G)) { +      g_mask = 0; +   } +   if (!(colormask & PIPE_MASK_B)) { +      b_mask = 0; +   } +   if (!(colormask & PIPE_MASK_A)) { +      a_mask = 0; +   } + +   /* Get a temporary register to hold the mask that will be applied +    * to the fragment +    */ +   int colormask_reg = spe_allocate_available_register(f); + +   /* The actual mask we're going to use is an OR of the remaining R, G, B, +    * and A masks.  Load the result value into our temporary register. +    */ +   spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask); + +   /* Use the mask register to select between the fragment color +    * values and the frame buffer color values.  
Wherever the +    * mask has a 0 bit, the current frame buffer color should override +    * the fragment color.  Wherever the mask has a 1 bit, the  +    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM) +    * instruction will select bits from its first operand rA wherever the +    * the mask bits rM are 0, and from its second operand rB wherever the +    * mask bits rM are 1.  That means that the frame buffer color is the +    * first operand, and the fragment color the second. +    */ +    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + +    /* Release the temporary register and we're done */ +    spe_release_register(f, colormask_reg); +} + + +/** + * This function is annoyingly similar to gen_depth_test(), above, except + * that instead of comparing two varying values (i.e. fragment and buffer), + * we're comparing a varying value with a static value.  As such, we have + * access to the Compare Immediate instructions where we don't in  + * gen_depth_test(), which is what makes us very different. + * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * + * The return value in the stencil_pass_reg is a bitmask of valid + * fragments that also passed the stencil test.  The bitmask of valid + * fragments that failed would be found in + * (fragment_mask_reg & ~stencil_pass_reg). + */ +static void +gen_stencil_test(struct spe_function *f, +                 const struct pipe_stencil_state *state,  +                 uint stencil_max_value, +                 int fragment_mask_reg, +                 int fbS_reg,  +                 int stencil_pass_reg) +{ +   /* Generate code that puts the set of passing fragments into the +    * stencil_pass_reg register, taking into account whether each fragment +    * was active to begin with. 
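+    *
+    * Each case below follows the same basic pattern:
+    *    stencil_pass = fragment_mask & compare(s, reference)
+    * with s and reference first ANDed with state->valuemask when the
+    * mask does not cover all stencil bits.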
+    */ +   switch (state->func) { +   case PIPE_FUNC_EQUAL: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & (s == reference) */ +         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ +         uint tmp_masked_stencil = spe_allocate_available_register(f); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, +                                state->valuemask & state->ref_value); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_NOTEQUAL: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & ~(s == reference) */ +         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ +         int tmp_masked_stencil = spe_allocate_available_register(f); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, +                                state->valuemask & state->ref_value); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_LESS: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & (reference < s)  */ +         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */ +         int tmp_masked_stencil = spe_allocate_available_register(f); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, +                                  state->valuemask & state->ref_value); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_GREATER: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & (reference > s) */ +         /* There's no convenient Compare Less Than Immediate instruction, so +          * we'll have to do this one the harder way, by loading a register and  +          * comparing directly.  Compare Logical Greater Than Word (clgt)  +          * treats its operands as unsigned - no sign extension. 
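+          * So the reference value is loaded into a temporary register and
+          * the test becomes clgt(reference, s).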
+          */ +         int tmp_reg = spe_allocate_available_register(f); +         spe_load_uint(f, tmp_reg, state->ref_value); +         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ +         int tmp_reg = spe_allocate_available_register(f); +         int tmp_masked_stencil = spe_allocate_available_register(f); +         spe_load_uint(f, tmp_reg, state->valuemask & state->ref_value); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); +         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_GEQUAL: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & (reference >= s)  +          *              = fragment_mask & ~(s > reference) */ +         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, +                                  state->ref_value); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ +         int tmp_masked_stencil = spe_allocate_available_register(f); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, +                                  state->valuemask & state->ref_value); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_LEQUAL: +      if (state->valuemask == stencil_max_value) { +         /* stencil_pass = fragment_mask & (reference <= s) ] +          *               = fragment_mask & ~(reference > s) */ +         /* As above, we have to do this by loading a register */ +         int tmp_reg = spe_allocate_available_register(f); +         spe_load_uint(f, tmp_reg, state->ref_value); +         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_reg); +      } +      else { +         /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ +         int tmp_reg = spe_allocate_available_register(f); +         int tmp_masked_stencil = spe_allocate_available_register(f); +         spe_load_uint(f, tmp_reg, state->ref_value & state->valuemask); +         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); +         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); +         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); +         spe_release_register(f, tmp_reg); +         spe_release_register(f, tmp_masked_stencil); +      } +      break; + +   case PIPE_FUNC_NEVER: +      /* stencil_pass = fragment_mask & 0 = 0 */ +      spe_load_uint(f, stencil_pass_reg, 0); +      break; + +   case PIPE_FUNC_ALWAYS: +      /* stencil_pass = fragment_mask & 1 = fragment_mask */ +      spe_move(f, stencil_pass_reg, fragment_mask_reg); +      break; +   } + +   /* The fragments that passed the stencil test are now in stencil_pass_reg. 
+    * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). +    */ +} + + +/** + * This function generates code that calculates a set of new stencil values + * given the earlier values and the operation to apply.  It does not + * apply any tests.  It is intended to be called up to 3 times + * (for the stencil fail operation, for the stencil pass-z fail operation, + * and for the stencil pass-z pass operation) to collect up to three + * possible sets of values, and for the caller to combine them based + * on the result of the tests. + * + * stencil_max_value should be (2^n - 1) where n is the number of bits + * in the stencil buffer - in other words, it should be usable as a mask. + */ +static void +gen_stencil_values(struct spe_function *f, +                   uint stencil_op, +                   uint stencil_ref_value, +                   uint stencil_max_value, +                   int fbS_reg, +                   int newS_reg) +{ +   /* The code below assumes that newS_reg and fbS_reg are not the same +    * register; if they can be, the calculations below will have to use +    * an additional temporary register.  For now, mark the assumption +    * with an assertion that will fail if they are the same. +    */ +   ASSERT(fbS_reg != newS_reg); + +   /* The code also assumes the the stencil_max_value is of the form  +    * 2^n-1 and can therefore be used as a mask for the valid bits in  +    * addition to a maximum.  Make sure this is the case as well. +    * The clever math below exploits the fact that incrementing a  +    * binary number serves to flip all the bits of a number starting at +    * the LSB and continuing to (and including) the first zero bit +    * found.  That means that a number and its increment will always +    * have at least one bit in common (the high order bit, if nothing +    * else) *unless* the number is zero, *or* the number is of a form +    * consisting of some number of 1s in the low-order bits followed +    * by nothing but 0s in the high-order bits.  The latter case +    * implies it's of the form 2^n-1. +    */ +   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0); + +   switch(stencil_op) { +   case PIPE_STENCIL_OP_KEEP: +      /* newS = S */ +      spe_move(f, newS_reg, fbS_reg); +      break; + +   case PIPE_STENCIL_OP_ZERO: +      /* newS = 0 */ +      spe_zero(f, newS_reg); +      break; + +   case PIPE_STENCIL_OP_REPLACE: +      /* newS = stencil reference value */ +      spe_load_uint(f, newS_reg, stencil_ref_value); +      break; + +   case PIPE_STENCIL_OP_INCR: { +      /* newS = (s == max ? max : s + 1) */ +      int equals_reg = spe_allocate_available_register(f); + +      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value); +      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ +      spe_ai(f, newS_reg, fbS_reg, 1); +      /* Select from the current value or the new value based on the equality test */ +      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + +      spe_release_register(f, equals_reg); +      break; +   } +   case PIPE_STENCIL_OP_DECR: { +      /* newS = (s == 0 ? 
0 : s - 1) */ +      int equals_reg = spe_allocate_available_register(f); + +      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0); +      /* Add Word Immediate with a (-1) value works */ +      spe_ai(f, newS_reg, fbS_reg, -1); +      /* Select from the current value or the new value based on the equality test */ +      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + +      spe_release_register(f, equals_reg); +      break; +   } +   case PIPE_STENCIL_OP_INCR_WRAP: +      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can +       * do a normal add and mask off the correct bits  +       */ +      spe_ai(f, newS_reg, fbS_reg, 1); +      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); +      break; + +   case PIPE_STENCIL_OP_DECR_WRAP: +      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */ +      spe_ai(f, newS_reg, fbS_reg, -1); +      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); +      break; + +   case PIPE_STENCIL_OP_INVERT: +      /* newS = ~s.  We take advantage of the mask/max value to invert only +       * the valid bits for the field so we don't have to do an extra "and". +       */ +      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value); +      break; + +   default: +      ASSERT(0); +   } +} + + +/** + * This function generates code to get all the necessary possible + * stencil values.  For each of the output registers (fail_reg, + * zfail_reg, and zpass_reg), it either allocates a new register + * and calculates a new set of values based on the stencil operation, + * or it reuses a register allocation and calculation done for an + * earlier (matching) operation, or it reuses the fbS_reg register + * (if the stencil operation is KEEP, which doesn't change the  + * stencil buffer). + * + * Since this function allocates a variable number of registers, + * to avoid incurring complex logic to free them, they should + * be allocated after a spe_allocate_register_set() call + * and released by the corresponding spe_release_register_set() call. + */ +static void +gen_get_stencil_values(struct spe_function *f, +                       const struct pipe_stencil_state *stencil, +                       const uint depth_enabled, +                       int fbS_reg,  +                       int *fail_reg, +                       int *zfail_reg,  +                       int *zpass_reg) +{ +   uint zfail_op; + +   /* Stenciling had better be enabled here */ +   ASSERT(stencil->enabled); + +   /* If the depth test is not enabled, it is treated as though it always +    * passes, which means that the zfail_op is not considered - a +    * failing stencil test triggers the fail_op, and a passing one +    * triggers the zpass_op +    * +    * As an optimization, override calculation of the zfail_op values +    * if they aren't going to be used.  By setting the value of +    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed +    * to match the incoming stencil values, and no calculation will +    * be done. 
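+    * (For example: with the depth test disabled and fail_op == zpass_op ==
+    * PIPE_STENCIL_OP_INCR, only one set of INCR values is computed;
+    * fail_reg and zpass_reg share that register, and zfail_reg simply
+    * aliases fbS_reg.)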
+    */ +   if (depth_enabled) { +      zfail_op = stencil->zfail_op; +   } +   else { +      zfail_op = PIPE_STENCIL_OP_KEEP; +   } + +   /* One-sided or front-facing stencil */ +   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) { +      *fail_reg = fbS_reg; +   } +   else { +      *fail_reg = spe_allocate_available_register(f); +      gen_stencil_values(f, stencil->fail_op, stencil->ref_value,  +         0xff, fbS_reg, *fail_reg); +   } + +   /* Check the possibly overridden value, not the structure value */ +   if (zfail_op == PIPE_STENCIL_OP_KEEP) { +      *zfail_reg = fbS_reg; +   } +   else if (zfail_op == stencil->fail_op) { +      *zfail_reg = *fail_reg; +   } +   else { +      *zfail_reg = spe_allocate_available_register(f); +      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value,  +         0xff, fbS_reg, *zfail_reg); +   } + +   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { +      *zpass_reg = fbS_reg; +   } +   else if (stencil->zpass_op == stencil->fail_op) { +      *zpass_reg = *fail_reg; +   } +   else if (stencil->zpass_op == zfail_op) { +      *zpass_reg = *zfail_reg; +   } +   else { +      *zpass_reg = spe_allocate_available_register(f); +      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value,  +         0xff, fbS_reg, *zpass_reg); +   } +} + +/** + * Note that fbZ_reg may *not* be set on entry, if in fact + * the depth test is not enabled.  This function must not use + * the register if depth is not enabled. + */ +static boolean +gen_stencil_depth_test(struct spe_function *f,  +                       const struct pipe_depth_stencil_alpha_state *dsa,  +                       const uint facing, +                       const int mask_reg, const int fragZ_reg,  +                       const int fbZ_reg, const int fbS_reg) +{ +   /* True if we've generated code that could require writeback to the +    * depth and/or stencil buffers +    */ +   boolean modified_buffers = FALSE; + +   boolean need_to_calculate_stencil_values; +   boolean need_to_writemask_stencil_values; + +   struct pipe_stencil_state *stencil; + +   /* Registers.  We may or may not actually allocate these, depending +    * on whether the state values indicate that we need them. +    */ +   int stencil_pass_reg, stencil_fail_reg; +   int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values; +   int stencil_writemask_reg; +   int zmask_reg; +   int newS_reg; + +   /* Stenciling is quite complex: up to six different configurable stencil  +    * operations/calculations can be required (three each for front-facing +    * and back-facing fragments).  Many of those operations will likely  +    * be identical, so there's good reason to try to avoid calculating  +    * the same values more than once (which unfortunately makes the code less  +    * straightforward). +    * +    * To make register management easier, we start a new  +    * register set; we can release all the registers in the set at +    * once, and avoid having to keep track of exactly which registers +    * we allocate.  We can still allocate and free registers as  +    * desired (if we know we no longer need a register), but we don't +    * have to spend the complexity to track the more difficult variant +    * register usage scenarios. +    */ +   spe_comment(f, 0, "Allocating stencil register set"); +   spe_allocate_register_set(f); + +   /* The facing we're given is the fragment facing; it doesn't +    * exactly match the stencil facing.  
If stencil is enabled, +    * but two-sided stencil is *not* enabled, we use the same +    * stencil settings for both front- and back-facing fragments. +    * We only use the "back-facing" stencil for backfacing fragments +    * if two-sided stenciling is enabled. +    */ +   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) { +      stencil = &dsa->stencil[1]; +   } +   else { +      stencil = &dsa->stencil[0]; +   } + +   /* Calculate the writemask.  If the writemask is trivial (either +    * all 0s, meaning that we don't need to calculate any stencil values +    * because they're not going to change the stencil anyway, or all 1s, +    * meaning that we have to calculate the stencil values but do not +    * need to mask them), we can avoid generating code.  Don't forget +    * that we need to consider backfacing stencil, if enabled. +    * +    * Note that if the backface stencil is *not* enabled, the backface +    * stencil will have the same values as the frontface stencil. +    */ +   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP && +       stencil->zfail_op == PIPE_STENCIL_OP_KEEP && +       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { +       need_to_calculate_stencil_values = FALSE; +       need_to_writemask_stencil_values = FALSE; +    } +    else if (stencil->writemask == 0x0) { +      /* All changes are writemasked out, so no need to calculate +       * what those changes might be, and no need to write anything back. +       */ +      need_to_calculate_stencil_values = FALSE; +      need_to_writemask_stencil_values = FALSE; +   } +   else if (stencil->writemask == 0xff) { +      /* Still trivial, but a little less so.  We need to write the stencil +       * values, but we don't need to mask them. +       */ +      need_to_calculate_stencil_values = TRUE; +      need_to_writemask_stencil_values = FALSE; +   } +   else { +      /* The general case: calculate, mask, and write */ +      need_to_calculate_stencil_values = TRUE; +      need_to_writemask_stencil_values = TRUE; + +      /* While we're here, generate code that calculates what the +       * writemask should be.  If backface stenciling is enabled, +       * and the backface writemask is not the same as the frontface +       * writemask, we'll have to generate code that merges the +       * two masks into a single effective mask based on fragment facing. +       */ +      spe_comment(f, 0, "Computing stencil writemask"); +      stencil_writemask_reg = spe_allocate_available_register(f); +      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask); +   } + +   /* At least one-sided stenciling must be on.  Generate code that +    * runs the stencil test on the basic/front-facing stencil, leaving +    * the mask of passing stencil bits in stencil_pass_reg.  This mask will +    * be used both to mask the set of active pixels, and also to +    * determine how the stencil buffer changes. +    * +    * This test will *not* change the value in mask_reg (because we don't +    * yet know whether to apply the two-sided stencil or one-sided stencil). +    */ +   spe_comment(f, 0, "Running basic stencil test"); +   stencil_pass_reg = spe_allocate_available_register(f); +   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg); + +   /* Generate code that, given the mask of valid fragments and the +    * mask of valid fragments that passed the stencil test, computes +    * the mask of valid fragments that failed the stencil test.  
We +    * have to do this before we run a depth test (because the +    * depth test should not be performed on fragments that failed the +    * stencil test, and because the depth test will update the  +    * mask of valid fragments based on the results of the depth test). +    */ +   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask"); +   stencil_fail_reg = spe_allocate_available_register(f); +   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg); +   /* Now remove the stenciled-out pixels from the valid fragment mask, +    * so we can later use the valid fragment mask in the depth test. +    */ +   spe_and(f, mask_reg, mask_reg, stencil_pass_reg); + +   /* We may not need to calculate stencil values, if the writemask is off */ +   if (need_to_calculate_stencil_values) { +      /* Generate code that calculates exactly which stencil values we need, +       * without calculating the same value twice (say, if two different +       * stencil ops have the same value).  This code will work for one-sided +       * and two-sided stenciling (so that we take into account that operations +       * may match between front and back stencils), and will also take into +       * account whether the depth test is enabled (if the depth test is off, +       * we don't need any of the zfail results, because the depth test always +       * is considered to pass if it is disabled).  Any register value that +       * does not need to be calculated will come back with the same value +       * that's in fbS_reg. +       * +       * This function will allocate a variant number of registers that +       * will be released as part of the register set. +       */ +      spe_comment(f, 0, facing == CELL_FACING_FRONT +                  ? "Computing front-facing stencil values" +                  : "Computing back-facing stencil values"); +      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg,  +         &stencil_fail_values, &stencil_pass_depth_fail_values,  +         &stencil_pass_depth_pass_values); +   }   + +   /* We now have all the stencil values we need.  We also need  +    * the results of the depth test to figure out which +    * stencil values will become the new stencil values.  (Even if +    * we aren't actually calculating stencil values, we need to apply +    * the depth test if it's enabled.) +    * +    * The code generated by gen_depth_test() returns the results of the +    * test in the given register, but also alters the mask_reg based +    * on the results of the test. +    */ +   if (dsa->depth.enabled) { +      spe_comment(f, 0, "Running stencil depth test"); +      zmask_reg = spe_allocate_available_register(f); +      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, +                                         fbZ_reg, zmask_reg); +   } + +   if (need_to_calculate_stencil_values) { + +      /* If we need to writemask the stencil values before going into +       * the stencil buffer, we'll have to use a new register to +       * hold the new values.  If not, we can just keep using the +       * current register. 
+       */ +      if (need_to_writemask_stencil_values) { +         newS_reg = spe_allocate_available_register(f); +         spe_comment(f, 0, "Saving current stencil values for writemasking"); +         spe_move(f, newS_reg, fbS_reg); +      } +      else { +         newS_reg = fbS_reg; +      } + +      /* Merge in the selected stencil fail values */ +      if (stencil_fail_values != fbS_reg) { +         spe_comment(f, 0, "Loading stencil fail values"); +         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg); +         modified_buffers = TRUE; +      } + +      /* Same for the stencil pass/depth fail values.  If this calculation +       * is not needed (say, if depth test is off), then the +       * stencil_pass_depth_fail_values register will be equal to fbS_reg +       * and we'll skip the calculation. +       */ +      if (stencil_pass_depth_fail_values != fbS_reg) { +         /* We don't actually have a stencil pass/depth fail mask yet. +          * Calculate it here from the stencil passing mask and the +          * depth passing mask.  Note that zmask_reg *must* have been +          * set above if we're here. +          */ +         uint stencil_pass_depth_fail_mask = +            spe_allocate_available_register(f); + +         spe_comment(f, 0, "Loading stencil pass/depth fail values"); +         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg); + +         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, +                  stencil_pass_depth_fail_mask); + +         spe_release_register(f, stencil_pass_depth_fail_mask); +         modified_buffers = TRUE; +      } + +      /* Same for the stencil pass/depth pass mask.  Note that we +       * *can* get here with zmask_reg being unset (if the depth +       * test is off but the stencil test is on).  In this case, +       * we assume the depth test passes, and don't need to mask +       * the stencil pass mask with the Z mask. +       */ +      if (stencil_pass_depth_pass_values != fbS_reg) { +         if (dsa->depth.enabled) { +            uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f); +            /* We'll need a separate register */ +            spe_comment(f, 0, "Loading stencil pass/depth pass values"); +            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg); +            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask); +            spe_release_register(f, stencil_pass_depth_pass_mask); +         } +         else { +            /* We can use the same stencil-pass register */ +            spe_comment(f, 0, "Loading stencil pass values"); +            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg); +         } +         modified_buffers = TRUE; +      } + +      /* Almost done.  If we need to writemask, do it now, leaving the +       * results in the fbS_reg register passed in.  If we don't need +       * to writemask, then the results are *already* in the fbS_reg, +       * so there's nothing more to do. +       */ + +      if (need_to_writemask_stencil_values && modified_buffers) { +         /* The Select Bytes command makes a fine writemask.  Where +          * the mask is 0, the first (original) values are retained, +          * effectively masking out changes.  Where the mask is 1, the +          * second (new) values are retained, incorporating changes. 
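+          * (Per bit, selb(d, a, b, m) computes d = (a & ~m) | (b & m), so the
+          * writemask register picks the old stencil bits where the mask is 0
+          * and the new bits where it is 1.)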
+          */ +         spe_comment(f, 0, "Writemasking new stencil values"); +         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg); +      } + +   } /* done calculating stencil values */ + +   /* The stencil and/or depth values have been applied, and the +    * mask_reg, fbS_reg, and fbZ_reg values have been updated. +    * We're all done, except that we've allocated a fair number +    * of registers that we didn't bother tracking.  Release all +    * those registers as part of the register set, and go home. +    */ +   spe_comment(f, 0, "Releasing stencil register set"); +   spe_release_register_set(f); + +   /* Return TRUE if we could have modified the stencil and/or +    * depth buffers. +    */ +   return modified_buffers; +} + + +/** + * Generate depth and/or stencil test code. + * \param cell  context + * \param dsa  depth/stencil/alpha state + * \param f  spe function to emit + * \param facing  either CELL_FACING_FRONT or CELL_FACING_BACK + * \param mask_reg  register containing the pixel alive/dead mask + * \param depth_tile_reg  register containing address of z/stencil tile + * \param quad_offset_reg  offset to quad from start of tile + * \param fragZ_reg  register containg fragment Z values + */ +static void +gen_depth_stencil(struct cell_context *cell, +                  const struct pipe_depth_stencil_alpha_state *dsa, +                  struct spe_function *f, +                  uint facing, +                  int mask_reg, +                  int depth_tile_reg, +                  int quad_offset_reg, +                  int fragZ_reg) + +{ +   const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; +   boolean write_depth_stencil; + +   /* framebuffer's combined z/stencil values register */ +   int fbZS_reg = spe_allocate_available_register(f); + +   /* Framebufer Z values register */ +   int fbZ_reg = spe_allocate_available_register(f); + +   /* Framebuffer stencil values register (may not be used) */ +   int fbS_reg = spe_allocate_available_register(f); + +   /* 24-bit mask register (may not be used) */ +   int zmask_reg = spe_allocate_available_register(f); + +   /** +    * The following code: +    * 1. fetch quad of packed Z/S values from the framebuffer tile. +    * 2. extract the separate the Z and S values from packed values +    * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints +    * +    * The instructions for doing this are interleaved for better performance. 
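+    * (Worked example, assuming cfltu scales by 2^32 here: fragZ = 0.5 becomes
+    * 0x80000000, and the rotmi by -8 used for the 24-bit formats leaves
+    * 0x00800000, i.e. 0.5 scaled to 24 bits.)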
+    */ +   spe_comment(f, 0, "Fetch Z/stencil quad from tile"); + +   switch(zs_format) { +   case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ +   case PIPE_FORMAT_X8Z24_UNORM: +      /* prepare mask to extract Z vals from ZS vals */ +      spe_load_uint(f, zmask_reg, 0x00ffffff); + +      /* convert fragment Z from [0,1] to 32-bit ints */ +      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + +      /* right shift 32-bit fragment Z to 24 bits */ +      spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + +      /* extract 24-bit Z values from ZS values by masking */ +      spe_and(f, fbZ_reg, fbZS_reg, zmask_reg); + +      /* extract 8-bit stencil values by shifting */ +      spe_rotmi(f, fbS_reg, fbZS_reg, -24); +      break; + +   case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ +   case PIPE_FORMAT_Z24X8_UNORM: +      /* convert fragment Z from [0,1] to 32-bit ints */ +      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + +      /* right shift 32-bit fragment Z to 24 bits */ +      spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + +      /* extract 24-bit Z values from ZS values by shifting */ +      spe_rotmi(f, fbZ_reg, fbZS_reg, -8); + +      /* extract 8-bit stencil values by masking */ +      spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); +      break; + +   case PIPE_FORMAT_Z32_UNORM: +      /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg); + +      /* convert fragment Z from [0,1] to 32-bit ints */ +      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + +      /* No stencil, so can't do anything there */ +      break; + +   case PIPE_FORMAT_Z16_UNORM: +      /* XXX This code for 16bpp Z is broken! */ + +      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ +      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + +      /* Copy over 4 32-bit values */ +      spe_move(f, fbZ_reg, fbZS_reg); + +      /* convert Z from [0,1] to 16-bit ints */ +      spe_cfltu(f, fragZ_reg, fragZ_reg, 32); +      spe_rotmi(f, fragZ_reg, fragZ_reg, -16); +      /* No stencil */ +      break; + +   default: +      ASSERT(0); /* invalid format */ +   } + +   /* If stencil is enabled, use the stencil-specific code +    * generator to generate both the stencil and depth (if needed) +    * tests.  Otherwise, if only depth is enabled, generate +    * a quick depth test.  The test generators themselves will +    * report back whether the depth/stencil buffer has to be +    * written back. +    */ +   if (dsa->stencil[0].enabled) { +      /* This will perform the stencil and depth tests, and update +       * the mask_reg, fbZ_reg, and fbS_reg as required by the +       * tests. +       */ +      ASSERT(fbS_reg >= 0); +      spe_comment(f, 0, "Perform stencil test"); + +      /* Note that fbZ_reg may not be set on entry, if stenciling +       * is enabled but there's no Z-buffer.  The  +       * gen_stencil_depth_test() function must ignore the +       * fbZ_reg register if depth is not enabled. 
+       */ +      write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, +                                                   mask_reg, fragZ_reg, +                                                   fbZ_reg, fbS_reg); +   } +   else if (dsa->depth.enabled) { +      int zmask_reg = spe_allocate_available_register(f); +      ASSERT(fbZ_reg >= 0); +      spe_comment(f, 0, "Perform depth test"); +      write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, +                                           fbZ_reg, zmask_reg); +      spe_release_register(f, zmask_reg); +   } +   else { +      write_depth_stencil = FALSE; +   } + +   if (write_depth_stencil) { +      /* Merge latest Z and Stencil values into fbZS_reg. +       * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. +       * fbS_reg has four 8-bit Z values in bits [7..0]. +       */ +      spe_comment(f, 0, "Store quad's depth/stencil values in tile"); +      if (zs_format == PIPE_FORMAT_S8Z24_UNORM || +          zs_format == PIPE_FORMAT_X8Z24_UNORM) { +         spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ +         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ +      } +      else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || +               zs_format == PIPE_FORMAT_Z24X8_UNORM) { +         spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ +         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ +      } +      else if (zs_format == PIPE_FORMAT_Z32_UNORM) { +         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ +      } +      else if (zs_format == PIPE_FORMAT_Z16_UNORM) { +         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ +      } +      else if (zs_format == PIPE_FORMAT_S8_UNORM) { +         ASSERT(0);   /* XXX to do */ +      } +      else { +         ASSERT(0); /* bad zs_format */ +      } + +      /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ +      spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); +   } + +   /* Don't need these any more */ +   spe_release_register(f, fbZS_reg); +   spe_release_register(f, fbZ_reg); +   spe_release_register(f, fbS_reg); +   spe_release_register(f, zmask_reg); +} + + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader.  But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. + * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. + * + * \param cell  the rendering context (in) + * \param facing whether the generated code is for front-facing or  + *              back-facing fragments + * \param f     the generated function (in/out); on input, the function + *              must already have been initialized.  On exit, whatever + *              instructions within the generated function have had + *              the fragment ops appended. 
+ */ +void +cell_gen_fragment_function(struct cell_context *cell, +                           const uint facing, +                           struct spe_function *f) +{ +   const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; +   const struct pipe_blend_state *blend = cell->blend; +   const struct pipe_blend_color *blend_color = &cell->blend_color; +   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + +   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ +   const int x_reg = 3;  /* uint */ +   const int y_reg = 4;  /* uint */ +   const int color_tile_reg = 5;  /* tile_t * */ +   const int depth_tile_reg = 6;  /* tile_t * */ +   const int fragZ_reg = 7;   /* vector float */ +   const int fragR_reg = 8;   /* vector float */ +   const int fragG_reg = 9;   /* vector float */ +   const int fragB_reg = 10;  /* vector float */ +   const int fragA_reg = 11;  /* vector float */ +   const int mask_reg = 12;   /* vector uint */ + +   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK); + +   /* offset of quad from start of tile +    * XXX assuming 4-byte pixels for color AND Z/stencil!!!! +    */ +   int quad_offset_reg; + +   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */ + +   if (cell->debug_flags & CELL_DEBUG_ASM) { +      spe_print_code(f, TRUE); +      spe_indent(f, 8); +      spe_comment(f, -4, facing == CELL_FACING_FRONT +                  ? "Begin front-facing per-fragment ops" +                  : "Begin back-facing per-fragment ops"); +   } + +   spe_allocate_register(f, x_reg); +   spe_allocate_register(f, y_reg); +   spe_allocate_register(f, color_tile_reg); +   spe_allocate_register(f, depth_tile_reg); +   spe_allocate_register(f, fragZ_reg); +   spe_allocate_register(f, fragR_reg); +   spe_allocate_register(f, fragG_reg); +   spe_allocate_register(f, fragB_reg); +   spe_allocate_register(f, fragA_reg); +   spe_allocate_register(f, mask_reg); + +   quad_offset_reg = spe_allocate_available_register(f); +   fbRGBA_reg = spe_allocate_available_register(f); + +   /* compute offset of quad from start of tile, in bytes */ +   { +      int x2_reg = spe_allocate_available_register(f); +      int y2_reg = spe_allocate_available_register(f); + +      ASSERT(TILE_SIZE == 32); + +      spe_comment(f, 0, "Compute quad offset within tile"); +      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */ +      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */ +      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */ +      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */ +      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */ + +      spe_release_register(f, x2_reg); +      spe_release_register(f, y2_reg); +   } + +   /* Generate the alpha test, if needed. */ +   if (dsa->alpha.enabled) { +      gen_alpha_test(dsa, f, mask_reg, fragA_reg); +   } + +   /* generate depth and/or stencil test code */ +   if (dsa->depth.enabled || dsa->stencil[0].enabled) { +      gen_depth_stencil(cell, dsa, f, +                        facing, +                        mask_reg, +                        depth_tile_reg, +                        quad_offset_reg, +                        fragZ_reg); +   } + +   /* Get framebuffer quad/colors.  We'll need these for blending, +    * color masking, and to obey the quad/pixel mask. +    * Load: fbRGBA_reg = memory[color_tile + quad_offset] +    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking +    * we could skip this load. 
+    */ +   spe_comment(f, 0, "Fetch quad colors from tile"); +   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + +   if (blend->blend_enable) { +      spe_comment(f, 0, "Perform blending"); +      gen_blend(blend, blend_color, f, color_format, +                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); +   } + +   /* +    * Write fragment colors to framebuffer/tile. +    * This involves converting the fragment colors from float[4] to the +    * tile's specific format and obeying the quad/pixel mask. +    */ +   { +      int rgba_reg = spe_allocate_available_register(f); + +      /* Pack four float colors as four 32-bit int colors */ +      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors"); +      gen_pack_colors(f, color_format, +                      fragR_reg, fragG_reg, fragB_reg, fragA_reg, +                      rgba_reg); + +      if (blend->logicop_enable) { +         spe_comment(f, 0, "Compute logic op"); +         gen_logicop(blend, f, rgba_reg, fbRGBA_reg); +      } + +      if (blend->colormask != PIPE_MASK_RGBA) { +         spe_comment(f, 0, "Compute color mask"); +         gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); +      } + +      /* Mix fragment colors with framebuffer colors using the quad/pixel mask: +       * if (mask[i]) +       *    rgba[i] = rgba[i]; +       * else +       *    rgba[i] = framebuffer[i]; +       */ +      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + +      /* Store updated quad in tile: +       * memory[color_tile + quad_offset] = rgba_reg; +       */ +      spe_comment(f, 0, "Store quad colors into color tile"); +      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + +      spe_release_register(f, rgba_reg); +   } + +   //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + +   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */ + +   spe_release_register(f, fbRGBA_reg); +   spe_release_register(f, quad_offset_reg); + +   if (cell->debug_flags & CELL_DEBUG_ASM) { +      char buffer[1024]; +      sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",  +         facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst); +      spe_comment(f, -4, buffer); +   } +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 0000000000..21b35d1faf --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c new file mode 100644 index 0000000000..ca358ed031 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -0,0 +1,358 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: + *  Keith Whitwell <keith@tungstengraphics.com> + *  Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "draw/draw_context.h" +#include "cell_context.h" +#include "cell_flush.h" +#include "cell_pipe_state.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static void * +cell_create_blend_state(struct pipe_context *pipe, +                        const struct pipe_blend_state *blend) +{ +   return mem_dup(blend, sizeof(*blend)); +} + + +static void +cell_bind_blend_state(struct pipe_context *pipe, void *blend) +{ +   struct cell_context *cell = cell_context(pipe); + +   draw_flush(cell->draw); + +   cell->blend = (struct pipe_blend_state *) blend; +   cell->dirty |= CELL_NEW_BLEND; +} + + +static void +cell_delete_blend_state(struct pipe_context *pipe, void *blend) +{ +   FREE(blend); +} + + +static void +cell_set_blend_color(struct pipe_context *pipe, +                     const struct pipe_blend_color *blend_color) +{ +   struct cell_context *cell = cell_context(pipe); + +   draw_flush(cell->draw); + +   cell->blend_color = *blend_color; + +   cell->dirty |= CELL_NEW_BLEND; +} + + + + +static void * +cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, +                 const struct pipe_depth_stencil_alpha_state *dsa) +{ +   return mem_dup(dsa, sizeof(*dsa)); +} + + +static void +cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, +                                    void *dsa) +{ +   struct cell_context *cell = cell_context(pipe); + +   draw_flush(cell->draw); + +   cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa; +   cell->dirty |= CELL_NEW_DEPTH_STENCIL; +} + + +static void +cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa) +{ +   FREE(dsa); +} + + +static void +cell_set_clip_state(struct pipe_context *pipe, +                    const struct pipe_clip_state *clip) +{ +   struct cell_context *cell = cell_context(pipe); + +   /* pass the clip state to the draw module */ +   draw_set_clip_state(cell->draw, clip); +} + + + +/* Called when driver state tracker notices changes to the viewport + * matrix: + */ +static void +cell_set_viewport_state( struct pipe_context *pipe, +                         const struct pipe_viewport_state *viewport ) +{ +   struct cell_context *cell = cell_context(pipe); + +   cell->viewport = *viewport; /* struct copy */ +   cell->dirty |= CELL_NEW_VIEWPORT; + +   /* pass the viewport info to the draw module */ +   draw_set_viewport_state(cell->draw, viewport); + +   /* Using tnl/ and vf/ modules is temporary while getting started. +    * Full pipe will have vertex shader, vertex fetch of its own. 
+    */ +} + + +static void +cell_set_scissor_state( struct pipe_context *pipe, +                        const struct pipe_scissor_state *scissor ) +{ +   struct cell_context *cell = cell_context(pipe); + +   memcpy( &cell->scissor, scissor, sizeof(*scissor) ); +   cell->dirty |= CELL_NEW_SCISSOR; +} + + +static void +cell_set_polygon_stipple( struct pipe_context *pipe, +                          const struct pipe_poly_stipple *stipple ) +{ +   struct cell_context *cell = cell_context(pipe); + +   memcpy( &cell->poly_stipple, stipple, sizeof(*stipple) ); +   cell->dirty |= CELL_NEW_STIPPLE; +} + + + +static void * +cell_create_rasterizer_state(struct pipe_context *pipe, +                             const struct pipe_rasterizer_state *rasterizer) +{ +   return mem_dup(rasterizer, sizeof(*rasterizer)); +} + + +static void +cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast) +{ +   struct pipe_rasterizer_state *rasterizer = +      (struct pipe_rasterizer_state *) rast; +   struct cell_context *cell = cell_context(pipe); + +   /* pass-through to draw module */ +   draw_set_rasterizer_state(cell->draw, rasterizer); + +   cell->rasterizer = rasterizer; + +   cell->dirty |= CELL_NEW_RASTERIZER; +} + + +static void +cell_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) +{ +   FREE(rasterizer); +} + + + +static void * +cell_create_sampler_state(struct pipe_context *pipe, +                          const struct pipe_sampler_state *sampler) +{ +   return mem_dup(sampler, sizeof(*sampler)); +} + + +static void +cell_bind_sampler_states(struct pipe_context *pipe, +                         unsigned num, void **samplers) +{ +   struct cell_context *cell = cell_context(pipe); +   uint i, changed = 0x0; + +   assert(num <= CELL_MAX_SAMPLERS); + +   draw_flush(cell->draw); + +   for (i = 0; i < CELL_MAX_SAMPLERS; i++) { +      struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL; +      if (cell->sampler[i] != new_samp) { +         cell->sampler[i] = new_samp; +         changed |= (1 << i); +      } +   } + +   if (changed) { +      cell->dirty |= CELL_NEW_SAMPLER; +      cell->dirty_samplers |= changed; +   } +} + + +static void +cell_delete_sampler_state(struct pipe_context *pipe, +                              void *sampler) +{ +   FREE( sampler ); +} + + + +static void +cell_set_sampler_textures(struct pipe_context *pipe, +                          unsigned num, struct pipe_texture **texture) +{ +   struct cell_context *cell = cell_context(pipe); +   uint i, changed = 0x0; + +   assert(num <= CELL_MAX_SAMPLERS); + +   for (i = 0; i < CELL_MAX_SAMPLERS; i++) { +      struct pipe_texture *new_tex = i < num ? 
texture[i] : NULL; +      if ((struct pipe_texture *) cell->texture[i] != new_tex) { +         pipe_texture_reference((struct pipe_texture **) &cell->texture[i], +                                new_tex); +         changed |= (1 << i); +      } +   } + +   cell->num_textures = num; + +   if (changed) { +      cell->dirty |= CELL_NEW_TEXTURE; +      cell->dirty_textures |= changed; +   } +} + + + +static void +cell_set_framebuffer_state(struct pipe_context *pipe, +                           const struct pipe_framebuffer_state *fb) +{ +   struct cell_context *cell = cell_context(pipe); + +   if (1 /*memcmp(&cell->framebuffer, fb, sizeof(*fb))*/) { +      struct pipe_surface *csurf = fb->cbufs[0]; +      struct pipe_surface *zsurf = fb->zsbuf; +      uint i; +      uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE | +                    PIPE_BUFFER_USAGE_GPU_READ); + +      /* unmap old surfaces */ +      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { +         if (cell->framebuffer.cbufs[i] && cell->cbuf_map[i]) { +            pipe_surface_unmap(cell->framebuffer.cbufs[i]); +            cell->cbuf_map[i] = NULL; +         } +      } + +      if (cell->framebuffer.zsbuf && cell->zsbuf_map) { +         pipe_surface_unmap(cell->framebuffer.zsbuf); +         cell->zsbuf_map = NULL; +      } + +      /* Finish any pending rendering to the current surface before +       * installing a new surface! +       */ +      cell_flush_int(cell, CELL_FLUSH_WAIT); + +      /* update my state +       * (this is also where old surfaces will finally get freed) +       */ +      cell->framebuffer.width = fb->width; +      cell->framebuffer.height = fb->height; +      cell->framebuffer.nr_cbufs = fb->nr_cbufs; +      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { +         pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]); +      } +      pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf); + +      /* map new surfaces */ +      if (csurf) +         cell->cbuf_map[0] = pipe_surface_map(csurf, flags); + +      if (zsurf) +         cell->zsbuf_map = pipe_surface_map(zsurf, flags); + +      cell->dirty |= CELL_NEW_FRAMEBUFFER; +   } +} + + + +void +cell_init_state_functions(struct cell_context *cell) +{ +   cell->pipe.create_blend_state = cell_create_blend_state; +   cell->pipe.bind_blend_state   = cell_bind_blend_state; +   cell->pipe.delete_blend_state = cell_delete_blend_state; + +   cell->pipe.create_sampler_state = cell_create_sampler_state; +   cell->pipe.bind_sampler_states = cell_bind_sampler_states; +   cell->pipe.delete_sampler_state = cell_delete_sampler_state; + +   cell->pipe.set_sampler_textures = cell_set_sampler_textures; + +   cell->pipe.create_depth_stencil_alpha_state = cell_create_depth_stencil_alpha_state; +   cell->pipe.bind_depth_stencil_alpha_state   = cell_bind_depth_stencil_alpha_state; +   cell->pipe.delete_depth_stencil_alpha_state = cell_delete_depth_stencil_alpha_state; + +   cell->pipe.create_rasterizer_state = cell_create_rasterizer_state; +   cell->pipe.bind_rasterizer_state   = cell_bind_rasterizer_state; +   cell->pipe.delete_rasterizer_state = cell_delete_rasterizer_state; + +   cell->pipe.set_blend_color = cell_set_blend_color; +   cell->pipe.set_clip_state = cell_set_clip_state; + +   cell->pipe.set_framebuffer_state = cell_set_framebuffer_state; + +   cell->pipe.set_polygon_stipple = cell_set_polygon_stipple; +   cell->pipe.set_scissor_state = cell_set_scissor_state; +   cell->pipe.set_viewport_state = cell_set_viewport_state; +} diff --git 
a/src/gallium/drivers/cell/ppu/cell_pipe_state.h b/src/gallium/drivers/cell/ppu/cell_pipe_state.h new file mode 100644 index 0000000000..1889bd52ff --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.h @@ -0,0 +1,39 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_PIPE_STATE_H +#define CELL_PIPE_STATE_H + + +struct cell_context; + +extern void +cell_init_state_functions(struct cell_context *cell); + + +#endif /* CELL_PIPE_STATE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c new file mode 100644 index 0000000000..79cb8df82f --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -0,0 +1,211 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \brief  Last stage of 'draw' pipeline: send tris to SPUs. 
+ * \author  Brian Paul + */ + +#include "cell_context.h" +#include "cell_render.h" +#include "cell_spu.h" +#include "util/u_memory.h" +#include "draw/draw_private.h" + + +struct render_stage { +   struct draw_stage stage; /**< This must be first (base class) */ + +   struct cell_context *cell; +}; + + +static INLINE struct render_stage * +render_stage(struct draw_stage *stage) +{ +   return (struct render_stage *) stage; +} + + +static void render_begin( struct draw_stage *stage ) +{ +#if 0 +   struct render_stage *render = render_stage(stage); +   struct cell_context *sp = render->cell; +   const struct pipe_shader_state *fs = &render->cell->fs->shader; +   render->quad.nr_attrs = render->cell->nr_frag_attrs; + +   render->firstFpInput = fs->input_semantic_name[0]; + +   sp->quad.first->begin(sp->quad.first); +#endif +} + + +static void render_end( struct draw_stage *stage ) +{ +} + + +static void reset_stipple_counter( struct draw_stage *stage ) +{ +   struct render_stage *render = render_stage(stage); +   /*render->cell->line_stipple_counter = 0;*/ +} + + +static void +render_point(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +static void +render_line(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +/** Write a vertex into the prim buffer */ +static void +save_vertex(struct cell_prim_buffer *buf, uint pos, +            const struct vertex_header *vert) +{ +   uint attr, j; + +   for (attr = 0; attr < 2; attr++) { +      for (j = 0; j < 4; j++) { +         buf->vertex[pos][attr][j] = vert->data[attr][j]; +      } +   } + +   /* update bounding box */ +   if (vert->data[0][0] < buf->xmin) +      buf->xmin = vert->data[0][0]; +   if (vert->data[0][0] > buf->xmax) +      buf->xmax = vert->data[0][0]; +   if (vert->data[0][1] < buf->ymin) +      buf->ymin = vert->data[0][1]; +   if (vert->data[0][1] > buf->ymax) +      buf->ymax = vert->data[0][1]; +} + + +static void +render_tri(struct draw_stage *stage, struct prim_header *prim) +{ +   struct render_stage *rs = render_stage(stage); +   struct cell_context *cell = rs->cell; +   struct cell_prim_buffer *buf = &cell->prim_buffer; +   uint i; + +   if (buf->num_verts + 3 > CELL_MAX_VERTS) { +      cell_flush_prim_buffer(cell); +   } + +   i = buf->num_verts; +   assert(i+2 <= CELL_MAX_VERTS); +   save_vertex(buf, i+0, prim->v[0]); +   save_vertex(buf, i+1, prim->v[1]); +   save_vertex(buf, i+2, prim->v[2]); +   buf->num_verts += 3; +} + + +/** + * Send the a RENDER command to all SPUs to have them render the prims + * in the current prim_buffer. 
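+ * (As the code below shows, the per-SPU parameters are written into
+ * cell_global.command[i].render; the mailbox message itself presumably just
+ * carries the CELL_CMD_RENDER opcode for the SPU to dispatch on.)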
+ */ +void +cell_flush_prim_buffer(struct cell_context *cell) +{ +   uint i; + +   if (cell->prim_buffer.num_verts == 0) +      return; + +   for (i = 0; i < cell->num_spus; i++) { +      struct cell_command_render *render = &cell_global.command[i].render; +      render->prim_type = PIPE_PRIM_TRIANGLES; +      render->num_verts = cell->prim_buffer.num_verts; +      render->front_winding = cell->rasterizer->front_winding; +      render->vertex_size = cell->vertex_info->size * 4; +      render->xmin = cell->prim_buffer.xmin; +      render->ymin = cell->prim_buffer.ymin; +      render->xmax = cell->prim_buffer.xmax; +      render->ymax = cell->prim_buffer.ymax; +      render->vertex_data = &cell->prim_buffer.vertex; +      ASSERT_ALIGN16(render->vertex_data); +      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_RENDER); +   } + +   cell->prim_buffer.num_verts = 0; + +   cell->prim_buffer.xmin = 1e100; +   cell->prim_buffer.ymin = 1e100; +   cell->prim_buffer.xmax = -1e100; +   cell->prim_buffer.ymax = -1e100; + +   /* XXX temporary, need to double-buffer the prim buffer until we get +    * a real command buffer/list system. +    */ +   cell_flush(&cell->pipe, 0x0); +} + + + +static void render_destroy( struct draw_stage *stage ) +{ +   FREE( stage ); +} + + +/** + * Create a new draw/render stage.  This will be plugged into the + * draw module as the last pipeline stage. + */ +struct draw_stage *cell_draw_render_stage( struct cell_context *cell ) +{ +   struct render_stage *render = CALLOC_STRUCT(render_stage); + +   render->cell = cell; +   render->stage.draw = cell->draw; +   render->stage.begin = render_begin; +   render->stage.point = render_point; +   render->stage.line = render_line; +   render->stage.tri = render_tri; +   render->stage.end = render_end; +   render->stage.reset_stipple_counter = reset_stipple_counter; +   render->stage.destroy = render_destroy; + +   /* +   render->quad.coef = render->coef; +   render->quad.posCoef = &render->posCoef; +   */ + +   return &render->stage; +} diff --git a/src/gallium/drivers/cell/ppu/cell_render.h b/src/gallium/drivers/cell/ppu/cell_render.h new file mode 100644 index 0000000000..826dcbafeb --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.h @@ -0,0 +1,39 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef CELL_RENDER_H +#define CELL_RENDER_H + +struct cell_context; +struct draw_stage; + +extern void +cell_flush_prim_buffer(struct cell_context *cell); + +extern struct draw_stage *cell_draw_render_stage( struct cell_context *cell ); + +#endif /* CELL_RENDER_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c new file mode 100644 index 0000000000..512d85d352 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -0,0 +1,176 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" + +#include "cell/common.h" +#include "cell_screen.h" +#include "cell_texture.h" +#include "cell_winsys.h" + + +static const char * +cell_get_vendor(struct pipe_screen *screen) +{ +   return "Tungsten Graphics, Inc."; +} + + +static const char * +cell_get_name(struct pipe_screen *screen) +{ +   return "Cell"; +} + + +static int +cell_get_param(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +      return CELL_MAX_SAMPLERS; +   case PIPE_CAP_NPOT_TEXTURES: +      return 1; +   case PIPE_CAP_TWO_SIDED_STENCIL: +      return 1; +   case PIPE_CAP_GLSL: +      return 1; +   case PIPE_CAP_S3TC: +      return 0; +   case PIPE_CAP_ANISOTROPIC_FILTER: +      return 0; +   case PIPE_CAP_POINT_SPRITE: +      return 1; +   case PIPE_CAP_MAX_RENDER_TARGETS: +      return 1; +   case PIPE_CAP_OCCLUSION_QUERY: +      return 1; +   case PIPE_CAP_TEXTURE_SHADOW_MAP: +      return 10; +   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +      return CELL_MAX_TEXTURE_LEVELS; +   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +      return 8;  /* max 128x128x128 */ +   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +      return CELL_MAX_TEXTURE_LEVELS; +   case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +      return 1; /* XXX not really true */ +   case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +      return 0; /* XXX to do */ +   default: +      return 0; +   } +} + + +static float +cell_get_paramf(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_LINE_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_LINE_WIDTH_AA: +      return 255.0; /* arbitrary */ + +   case PIPE_CAP_MAX_POINT_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_POINT_WIDTH_AA: +      return 255.0; /* arbitrary */ + +   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +      return 0.0; + +   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +      return 16.0; /* arbitrary */ + +   default: +      return 0; +   } +} + + +static boolean +cell_is_format_supported( struct pipe_screen *screen, +                          enum pipe_format format,  +                          enum pipe_texture_target target, +                          unsigned tex_usage,  +                          unsigned geom_flags ) +{ +   /* cell supports most formats, XXX for now anyway */ +   if (format == PIPE_FORMAT_DXT5_RGBA || +       format == PIPE_FORMAT_R8G8B8A8_SRGB) +      return FALSE; +   else +      return TRUE; +} + + +static void +cell_destroy_screen( struct pipe_screen *screen ) +{ +   struct pipe_winsys *winsys = screen->winsys; + +   if(winsys->destroy) +      winsys->destroy(winsys); + +   FREE(screen); +} + + +/** + * Create a new pipe_screen object + * Note: we're not presently subclassing pipe_screen (no cell_screen) but + * that would be the place to put SPU thread/context info... 
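+ *
+ * The screen takes ownership of the winsys: cell_destroy_screen() above
+ * calls winsys->destroy() before freeing the screen itself.
+ *
+ * Rough usage sketch (the "ws" winsys pointer here is hypothetical):
+ *
+ *    struct pipe_screen *screen = cell_create_screen(ws);
+ *    ...
+ *    screen->destroy(screen);   (this also destroys "ws")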
+ */ +struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys) +{ +   struct pipe_screen *screen = CALLOC_STRUCT(pipe_screen); + +   if (!screen) +      return NULL; + +   screen->winsys = winsys; + +   screen->destroy = cell_destroy_screen; + +   screen->get_name = cell_get_name; +   screen->get_vendor = cell_get_vendor; +   screen->get_param = cell_get_param; +   screen->get_paramf = cell_get_paramf; +   screen->is_format_supported = cell_is_format_supported; + +   cell_init_screen_texture_funcs(screen); +   u_simple_screen_init(screen); + +   return screen; +} diff --git a/src/gallium/drivers/cell/ppu/cell_screen.h b/src/gallium/drivers/cell/ppu/cell_screen.h new file mode 100644 index 0000000000..c7e15889d6 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.h @@ -0,0 +1,41 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_SCREEN_H +#define CELL_SCREEN_H + + +struct pipe_screen; +struct pipe_winsys; + + +extern struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys); + + +#endif /* CELL_SCREEN_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c new file mode 100644 index 0000000000..28e5e6d706 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -0,0 +1,219 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *  
+ **************************************************************************/
+
+
+/**
+ * Utility/wrappers for communicating with the SPUs.
+ */
+
+
+#include <pthread.h>
+
+#include "cell_spu.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+#include "cell/common.h"
+
+
+/*
+helpful headers:
+/opt/ibm/cell-sdk/prototype/src/include/ppu/cbe_mfc.h
+*/
+
+
+/**
+ * Cell/SPU info that's not per-context.
+ */
+struct cell_global_info cell_global;
+
+
+/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase = 0;   /* default if no "timebase" line is found */
+
+   assert(f);
+   while (!feof(f)) {
+      char line[80];
+      if (!fgets(line, sizeof(line), f))
+         break;
+      if (strncmp(line, "timebase", 8) == 0) {
+         char *colon = strchr(line, ':');
+         if (colon) {
+            timebase = atoi(colon + 2);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
+/**
+ * Write a 1-word message to the given SPE mailbox.
+ */
+void
+send_mbox_message(spe_context_ptr_t ctx, unsigned int msg)
+{
+   spe_in_mbox_write(ctx, &msg, 1, SPE_MBOX_ALL_BLOCKING);
+}
+
+
+/**
+ * Wait for a 1-word message to arrive in given mailbox.
+ */
+uint
+wait_mbox_message(spe_context_ptr_t ctx)
+{
+   do {
+      unsigned data;
+      int count = spe_out_mbox_read(ctx, &data, 1);
+
+      if (count == 1) {
+	 return data;
+      }
+       
+      if (count < 0) {
+	 /* error */ ;
+      }
+   } while (1);
+}
+
+
+/**
+ * Called by pthread_create() to spawn an SPU thread.
+ */
+static void *
+cell_thread_function(void *arg)
+{
+   struct cell_init_info *init = (struct cell_init_info *) arg;
+   unsigned entry = SPE_DEFAULT_ENTRY;
+
+   ASSERT_ALIGN16(init);
+
+   if (spe_context_run(cell_global.spe_contexts[init->id], &entry, 0,
+                       init, NULL, NULL) < 0) {
+      fprintf(stderr, "spe_context_run() failed\n");
+      exit(1);
+   }
+
+   pthread_exit(NULL);
+}
+
+
+/**
+ * Create the SPU threads.  This is done once during driver initialization.
+ * This involves setting up the "init" message which is sent to each SPU.
+ * The init message specifies an SPU id, total number of SPUs, location
+ * and number of batch buffers, etc.
+ */
+void
+cell_start_spus(struct cell_context *cell)
+{
+   static boolean one_time_init = FALSE;
+   uint i, j;
+   uint timebase = get_timebase();
+
+   if (one_time_init) {
+      fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
+	      "on Cell.\n");
+      abort();
+   }
+
+   one_time_init = TRUE;
+
+   assert(cell->num_spus <= CELL_MAX_SPUS);
+
+   ASSERT_ALIGN16(&cell_global.inits[0]);
+   ASSERT_ALIGN16(&cell_global.inits[1]);
+
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
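+    * (The ASSERT_ALIGN16 checks above matter for this: DMA transfers on
+    * the SPU side generally require 16-byte-aligned source data.)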
+    * The SPUs will then each grab their init info with mfc_get(). +    */ +   for (i = 0; i < cell->num_spus; i++) { +      cell_global.inits[i].id = i; +      cell_global.inits[i].num_spus = cell->num_spus; +      cell_global.inits[i].debug_flags = cell->debug_flags; +      cell_global.inits[i].inv_timebase = 1000.0f / timebase; + +      for (j = 0; j < CELL_NUM_BUFFERS; j++) { +         cell_global.inits[i].buffers[j] = cell->buffer[j]; +      } +      cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + +      cell_global.inits[i].spu_functions = &cell->spu_functions; + +      cell_global.spe_contexts[i] = spe_context_create(0, NULL); +      if (!cell_global.spe_contexts[i]) { +         fprintf(stderr, "spe_context_create() failed\n"); +         exit(1); +      } + +      if (spe_program_load(cell_global.spe_contexts[i], &g3d_spu)) { +         fprintf(stderr, "spe_program_load() failed\n"); +         exit(1); +      } +       +      pthread_create(&cell_global.spe_threads[i], /* returned thread handle */ +                     NULL,                        /* pthread attribs */ +                     &cell_thread_function,       /* start routine */ +		     &cell_global.inits[i]);      /* thread argument */ +   } +} + + +/** + * Tell all the SPUs to stop/exit. + * This is done when the driver's exiting / cleaning up. + */ +void +cell_spu_exit(struct cell_context *cell) +{ +   uint i; + +   for (i = 0; i < cell->num_spus; i++) { +      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_EXIT); +   } + +   /* wait for threads to exit */ +   for (i = 0; i < cell->num_spus; i++) { +      void *value; +      pthread_join(cell_global.spe_threads[i], &value); +      cell_global.spe_threads[i] = 0; +      cell_global.spe_contexts[i] = 0; +   } +} diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h new file mode 100644 index 0000000000..c93958a9ed --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -0,0 +1,79 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#ifndef CELL_SPU +#define CELL_SPU + + +#include <libspe2.h> +#include <pthread.h> +#include "cell/common.h" + +#include "cell_context.h" + + +/** + * Global vars, for now anyway. + */ +struct cell_global_info +{ +   /** +    * SPU/SPE handles, etc +    */ +   spe_context_ptr_t spe_contexts[CELL_MAX_SPUS]; +   pthread_t spe_threads[CELL_MAX_SPUS]; + +   /** +    * Data sent to SPUs at start-up +    */ +   struct cell_init_info inits[CELL_MAX_SPUS]; +}; + + +extern struct cell_global_info cell_global; + + +/** This is the handle for the actual SPE code */ +extern spe_program_handle_t g3d_spu; + + +extern void +send_mbox_message(spe_context_ptr_t ctx, unsigned int msg); + +extern uint +wait_mbox_message(spe_context_ptr_t ctx); + + +extern void +cell_start_spus(struct cell_context *cell); + + +extern void +cell_spu_exit(struct cell_context *cell); + + +#endif /* CELL_SPU */ diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h new file mode 100644 index 0000000000..b193170f9c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state.h @@ -0,0 +1,65 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#ifndef CELL_STATE_H +#define CELL_STATE_H + + +#define CELL_NEW_VIEWPORT      0x1 +#define CELL_NEW_RASTERIZER    0x2 +#define CELL_NEW_FS            0x4 +#define CELL_NEW_BLEND         0x8 +#define CELL_NEW_CLIP          0x10 +#define CELL_NEW_SCISSOR       0x20 +#define CELL_NEW_STIPPLE       0x40 +#define CELL_NEW_FRAMEBUFFER   0x80 +#define CELL_NEW_ALPHA_TEST    0x100 +#define CELL_NEW_DEPTH_STENCIL 0x200 +#define CELL_NEW_SAMPLER       0x400 +#define CELL_NEW_TEXTURE       0x800 +#define CELL_NEW_VERTEX        0x1000 +#define CELL_NEW_VS            0x2000 +#define CELL_NEW_VS_CONSTANTS  0x4000 +#define CELL_NEW_FS_CONSTANTS  0x8000 +#define CELL_NEW_VERTEX_INFO   0x10000 + + +extern void +cell_update_derived( struct cell_context *softpipe ); + + +extern void +cell_init_shader_functions(struct cell_context *cell); + + +extern void +cell_init_vertex_functions(struct cell_context *cell); + + +#endif /* CELL_STATE_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c new file mode 100644 index 0000000000..efc4f78364 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c @@ -0,0 +1,170 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_state.h" +#include "cell_state_emit.h" + + +/** + * Determine how to map vertex program outputs to fragment program inputs. + * Basically, this will be used when computing the triangle interpolation + * coefficients from the post-transform vertex attributes. + */ +static void +calculate_vertex_layout( struct cell_context *cell ) +{ +   const struct cell_fragment_shader_state *fs = cell->fs; +   const enum interp_mode colorInterp +      = cell->rasterizer->flatshade ? 
INTERP_CONSTANT : INTERP_LINEAR; +   struct vertex_info *vinfo = &cell->vertex_info; +   uint i; +   int src; + +#if 0 +   if (cell->vbuf) { +      /* if using the post-transform vertex buffer, tell draw_vbuf to +       * simply emit the whole post-xform vertex as-is: +       */ +      struct vertex_info *vinfo_vbuf = &cell->vertex_info_vbuf; +      vinfo_vbuf->num_attribs = 0; +      draw_emit_vertex_attr(vinfo_vbuf, EMIT_ALL, INTERP_NONE, 0); +      vinfo_vbuf->size = 4 * vs->num_outputs + sizeof(struct vertex_header)/4; +   } +#endif + +   /* reset vinfo */ +   vinfo->num_attribs = 0; + +   /* we always want to emit vertex pos */ +   src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0); +   assert(src >= 0); +   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src); + + +   /* +    * Loop over fragment shader inputs, searching for the matching output +    * from the vertex shader. +    */ +   for (i = 0; i < fs->info.num_inputs; i++) { +      switch (fs->info.input_semantic_name[i]) { +      case TGSI_SEMANTIC_POSITION: +         /* already done above */ +         break; + +      case TGSI_SEMANTIC_COLOR: +         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR,  +                                   fs->info.input_semantic_index[i]); +         assert(src >= 0); +         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); +         break; + +      case TGSI_SEMANTIC_FOG: +         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0); +#if 1 +         if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */ +            src = 0; +#endif +         assert(src >= 0); +         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); +         break; + +      case TGSI_SEMANTIC_GENERIC: +         /* this includes texcoords and varying vars */ +         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC, +                              fs->info.input_semantic_index[i]); +         assert(src >= 0); +         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +         break; + +      default: +         assert(0); +      } +   } + +   draw_compute_vertex_size(vinfo); + +   /* XXX only signal this if format really changes */ +   cell->dirty |= CELL_NEW_VERTEX_INFO; +} + + +#if 0 +/** + * Recompute cliprect from scissor bounds, scissor enable and surface size. + */ +static void +compute_cliprect(struct cell_context *sp) +{ +   uint surfWidth = sp->framebuffer.width; +   uint surfHeight = sp->framebuffer.height; + +   if (sp->rasterizer->scissor) { +      /* clip to scissor rect */ +      sp->cliprect.minx = MAX2(sp->scissor.minx, 0); +      sp->cliprect.miny = MAX2(sp->scissor.miny, 0); +      sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); +      sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); +   } +   else { +      /* clip to surface bounds */ +      sp->cliprect.minx = 0; +      sp->cliprect.miny = 0; +      sp->cliprect.maxx = surfWidth; +      sp->cliprect.maxy = surfHeight; +   } +} +#endif + + + +/** + * Update derived state, send current state to SPUs prior to rendering. 
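+ *
+ * Concretely, this recomputes the vertex layout when the rasterizer,
+ * fragment shader or vertex shader changed, emits whatever is flagged in
+ * cell->dirty via cell_emit_state(), and then clears the dirty bits.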
+ */ +void cell_update_derived( struct cell_context *cell ) +{ +   if (cell->dirty & (CELL_NEW_RASTERIZER | +                      CELL_NEW_FS | +                      CELL_NEW_VS)) +      calculate_vertex_layout( cell ); + +#if 0 +   if (cell->dirty & (CELL_NEW_SCISSOR | +                      CELL_NEW_DEPTH_STENCIL_ALPHA | +                      CELL_NEW_FRAMEBUFFER)) +      compute_cliprect(cell); +#endif + +   cell_emit_state(cell); + +   cell->dirty = 0; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c new file mode 100644 index 0000000000..ff529fe22c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -0,0 +1,340 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" +#include "cell_state.h" +#include "cell_state_emit.h" +#include "cell_batch.h" +#include "cell_texture.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + + +/** + * Find/create a cell_command_fragment_ops object corresponding to the + * current blend/stencil/z/colormask/etc. state. + */ +static struct cell_command_fragment_ops * +lookup_fragment_ops(struct cell_context *cell) +{ +   struct cell_fragment_ops_key key; +   struct cell_command_fragment_ops *ops; + +   /* +    * Build key +    */ +   memset(&key, 0, sizeof(key)); +   key.blend = *cell->blend; +   key.blend_color = cell->blend_color; +   key.dsa = *cell->depth_stencil; + +   if (cell->framebuffer.cbufs[0]) +      key.color_format = cell->framebuffer.cbufs[0]->format; +   else +      key.color_format = PIPE_FORMAT_NONE; + +   if (cell->framebuffer.zsbuf) +      key.zs_format = cell->framebuffer.zsbuf->format; +   else +      key.zs_format = PIPE_FORMAT_NONE; + +   /* +    * Look up key in cache. +    */ +   ops = (struct cell_command_fragment_ops *) +      util_keymap_lookup(cell->fragment_ops_cache, &key); + +   /* +    * If not found, create/save new fragment ops command. 
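+    * Generating the SPE code is comparatively expensive, so the result is
+    * cached in cell->fragment_ops_cache, keyed on the blend, blend-color,
+    * depth/stencil/alpha state and the color/Z buffer formats built above.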
+    */ +   if (!ops) { +      struct spe_function spe_code_front, spe_code_back; +      unsigned int facing_dependent, total_code_size; + +      if (0) +         debug_printf("**** Create New Fragment Ops\n"); + +      /* Prepare the buffer that will hold the generated code.  The +       * "0" passed in for the size means that the SPE code will +       * use a default size. +       */ +      spe_init_func(&spe_code_front, 0); +      spe_init_func(&spe_code_back, 0); + +      /* Generate new code.  Always generate new code for both front-facing +       * and back-facing fragments, even if it's the same code in both +       * cases. +       */ +      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); +      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); + +      /* Make sure the code is a multiple of 8 bytes long; this is +       * required to ensure that the dual pipe instruction alignment +       * is correct.  It's also important for the SPU unpacking, +       * which assumes 8-byte boundaries. +       */ +      unsigned int front_code_size = spe_code_size(&spe_code_front); +      while (front_code_size % 8 != 0) { +         spe_lnop(&spe_code_front); +         front_code_size = spe_code_size(&spe_code_front); +      } +      unsigned int back_code_size = spe_code_size(&spe_code_back); +      while (back_code_size % 8 != 0) { +         spe_lnop(&spe_code_back); +         back_code_size = spe_code_size(&spe_code_back); +      } + +      /* Determine whether the code we generated is facing-dependent, by +       * determining whether the generated code is different for the front- +       * and back-facing fragments. +       */ +      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) { +         /* Code is identical; only need one copy. */ +         facing_dependent = 0; +         total_code_size = front_code_size; +      } +      else { +         /* Code is different for front-facing and back-facing fragments. +          * Need to send both copies. +          */ +         facing_dependent = 1; +         total_code_size = front_code_size + back_code_size; +      } + +      /* alloc new fragment ops command.  Note that this structure +       * has variant length based on the total code size required. +       */ +      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); +      /* populate the new cell_command_fragment_ops object */ +      ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS; +      ops->total_code_size = total_code_size; +      ops->front_code_index = 0; +      memcpy(ops->code, spe_code_front.store, front_code_size); +      if (facing_dependent) { +        /* We have separate front- and back-facing code.  Append the +         * back-facing code to the buffer.  Be careful because the code +         * size is in bytes, but the buffer is of unsigned elements. +         */ +        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]); +        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size); +      } +      else { +        /* Use the same code for front- and back-facing fragments */ +        ops->back_code_index = ops->front_code_index; +      } + +      /* Set the fields for the fallback case.  Note that these fields +       * (and the whole fallback case) will eventually go away. 
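+       * (Presumably the SPU falls back to these raw state structs whenever
+       * it cannot, or is asked not to, use the generated code.)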
+       */ +      ops->dsa = *cell->depth_stencil; +      ops->blend = *cell->blend; +      ops->blend_color = cell->blend_color; + +      /* insert cell_command_fragment_ops object into keymap/cache */ +      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); + +      /* release rtasm buffer */ +      spe_release_func(&spe_code_front); +      spe_release_func(&spe_code_back); +   } +   else { +      if (0) +         debug_printf("**** Re-use Fragment Ops\n"); +   } + +   return ops; +} + + + +static void +emit_state_cmd(struct cell_context *cell, uint cmd, +               const void *state, uint state_size) +{ +   uint32_t *dst = (uint32_t *)  +       cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size)); +   *dst = cmd; +   memcpy(dst + 4, state, state_size); +} + + +/** + * For state marked as 'dirty', construct a state-update command block + * and insert it into the current batch buffer. + */ +void +cell_emit_state(struct cell_context *cell) +{ +   if (cell->dirty & CELL_NEW_FRAMEBUFFER) { +      struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; +      struct pipe_surface *zbuf = cell->framebuffer.zsbuf; +      STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0); +      struct cell_command_framebuffer *fb +         = cell_batch_alloc16(cell, sizeof(*fb)); +      fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER; +      fb->color_start = cell->cbuf_map[0]; +      fb->color_format = cbuf->format; +      fb->depth_start = cell->zsbuf_map; +      fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE; +      fb->width = cell->framebuffer.width; +      fb->height = cell->framebuffer.height; +#if 0 +      printf("EMIT color format %s\n", pf_name(fb->color_format)); +      printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif +   } + +   if (cell->dirty & (CELL_NEW_RASTERIZER)) { +      STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0); +      struct cell_command_rasterizer *rast = +         cell_batch_alloc16(cell, sizeof(*rast)); +      rast->opcode[0] = CELL_CMD_STATE_RASTERIZER; +      rast->rasterizer = *cell->rasterizer; +   } + +   if (cell->dirty & (CELL_NEW_FS)) { +      /* Send new fragment program to SPUs */ +      STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0); +      struct cell_command_fragment_program *fp +            = cell_batch_alloc16(cell, sizeof(*fp)); +      fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM; +      fp->num_inst = cell->fs->code.num_inst; +      memcpy(&fp->code, cell->fs->code.store, +             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); +      if (0) { +         int i; +         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); +         for (i = 0; i < fp->num_inst; i++) { +            printf(" %3d: 0x%08x\n", i, fp->code[i]); +         } +      } +   } + +   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) { +      const uint shader = PIPE_SHADER_FRAGMENT; +      const uint num_const = cell->constants[shader].buffer->size / sizeof(float); +      uint i, j; +      float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float))); +      uint32_t *ibuf = (uint32_t *) buf; +      const float *constants = pipe_buffer_map(cell->pipe.screen, +                                               cell->constants[shader].buffer, +                                               PIPE_BUFFER_USAGE_CPU_READ); +      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; +      ibuf[4] = num_const; +      j = 8; +      for (i = 0; i < num_const; i++) { +         buf[j++] = 
constants[i]; +      } +      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer); +   } + +   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | +                      CELL_NEW_DEPTH_STENCIL | +                      CELL_NEW_BLEND)) { +      struct cell_command_fragment_ops *fops, *fops_cmd; +      /* Note that cell_command_fragment_ops is a variant-sized record */ +      fops = lookup_fragment_ops(cell); +      fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size)); +      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); +   } + +   if (cell->dirty & CELL_NEW_SAMPLER) { +      uint i; +      for (i = 0; i < CELL_MAX_SAMPLERS; i++) { +         if (cell->dirty_samplers & (1 << i)) { +            if (cell->sampler[i]) { +               STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0); +               struct cell_command_sampler *sampler +                  = cell_batch_alloc16(cell, sizeof(*sampler)); +               sampler->opcode[0] = CELL_CMD_STATE_SAMPLER; +               sampler->unit = i; +               sampler->state = *cell->sampler[i]; +            } +         } +      } +      cell->dirty_samplers = 0x0; +   } + +   if (cell->dirty & CELL_NEW_TEXTURE) { +      uint i; +      for (i = 0;i < CELL_MAX_SAMPLERS; i++) { +         if (cell->dirty_textures & (1 << i)) { +            STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0); +            struct cell_command_texture *texture +               =  (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture)); +            texture->opcode[0] = CELL_CMD_STATE_TEXTURE; +            texture->unit = i; +            if (cell->texture[i]) { +               uint level; +               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { +                  texture->start[level] = cell->texture[i]->tiled_mapped[level]; +                  texture->width[level] = cell->texture[i]->base.width[level]; +                  texture->height[level] = cell->texture[i]->base.height[level]; +                  texture->depth[level] = cell->texture[i]->base.depth[level]; +               } +               texture->target = cell->texture[i]->base.target; +            } +            else { +               uint level; +               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { +                  texture->start[level] = NULL; +                  texture->width[level] = 0; +                  texture->height[level] = 0; +                  texture->depth[level] = 0; +               } +               texture->target = 0; +            } +         } +      } +      cell->dirty_textures = 0x0; +   } + +   if (cell->dirty & CELL_NEW_VERTEX_INFO) { +      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, +                     &cell->vertex_info, sizeof(struct vertex_info)); +   } + +#if 0 +   if (cell->dirty & CELL_NEW_VS) { +      const struct draw_context *const draw = cell->draw; +      struct cell_shader_info info; + +      info.num_outputs = draw_num_vs_outputs(draw); +      info.declarations = (uintptr_t) draw->vs.machine.Declarations; +      info.num_declarations = draw->vs.machine.NumDeclarations; +      info.instructions = (uintptr_t) draw->vs.machine.Instructions; +      info.num_instructions = draw->vs.machine.NumInstructions; +      info.immediates = (uintptr_t) draw->vs.machine.Imms; +      info.num_immediates = draw->vs.machine.ImmLimit / 4; + +      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info)); +   } +#endif +} diff --git 
a/src/gallium/drivers/cell/ppu/cell_state_emit.h b/src/gallium/drivers/cell/ppu/cell_state_emit.h
new file mode 100644
index 0000000000..59f8affe8d
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.h
@@ -0,0 +1,36 @@
+/**************************************************************************
+ *  
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *  
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *  
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *  
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *  
+ **************************************************************************/
+
+#ifndef CELL_STATE_EMIT_H
+#define CELL_STATE_EMIT_H
+
+
+extern void
+cell_emit_state(struct cell_context *cell);
+
+
+#endif /* CELL_STATE_EMIT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
new file mode 100644
index 0000000000..d97c22b2ef
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -0,0 +1,1430 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions performs alpha, depth, and stencil
+ * testing as well as alpha blending.
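+ *
+ * The code is emitted for the SPE through the rtasm_ppc_spe run-time
+ * assembler (the spe_* calls below) and operates on four fragments at a
+ * time.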
+ * + * \note + * Occlusion query is not supported, but this is the right place to add that + * support. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "cell_context.h" + +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Generate code to perform alpha testing. + * + * The code generated by this function uses the register specificed by + * \c mask as both an input and an output. + * + * \param dsa    Current alpha-test state + * \param f      Function to which code should be appended + * \param mask   Index of register containing active fragment mask + * \param alphas Index of register containing per-fragment alpha values + * + * \note Emits a maximum of 6 instructions. + */ +static void +emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, +                struct spe_function *f, int mask, int alphas) +{ +   /* If the alpha function is either NEVER or ALWAYS, there is no need to +    * load the reference value into a register.  ALWAYS is a fairly common +    * case, and this optimization saves 2 instructions. +    */ +   if (dsa->alpha.enabled +       && (dsa->alpha.func != PIPE_FUNC_NEVER) +       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { +      int ref = spe_allocate_available_register(f); +      int tmp_a = spe_allocate_available_register(f); +      int tmp_b = spe_allocate_available_register(f); +      union { +         float f; +         unsigned u; +      } ref_val; +      boolean complement = FALSE; + +      ref_val.f = dsa->alpha.ref; + +      spe_il(f, ref, ref_val.u & 0x0000ffff); +      spe_ilh(f, ref, ref_val.u >> 16); + +      switch (dsa->alpha.func) { +      case PIPE_FUNC_NOTEQUAL: +         complement = TRUE; +         /* FALLTHROUGH */ + +      case PIPE_FUNC_EQUAL: +         spe_fceq(f, tmp_a, ref, alphas); +         break; + +      case PIPE_FUNC_LEQUAL: +         complement = TRUE; +         /* FALLTHROUGH */ + +      case PIPE_FUNC_GREATER: +         spe_fcgt(f, tmp_a, ref, alphas); +         break; + +      case PIPE_FUNC_LESS: +         complement = TRUE; +         /* FALLTHROUGH */ + +      case PIPE_FUNC_GEQUAL: +         spe_fcgt(f, tmp_a, ref, alphas); +         spe_fceq(f, tmp_b, ref, alphas); +         spe_or(f, tmp_a, tmp_b, tmp_a); +         break; + +      case PIPE_FUNC_ALWAYS: +      case PIPE_FUNC_NEVER: +      default: +         assert(0); +         break; +      } + +      if (complement) { +         spe_andc(f, mask, mask, tmp_a); +      } else { +         spe_and(f, mask, mask, tmp_a); +      } + +      spe_release_register(f, ref); +      spe_release_register(f, tmp_a); +      spe_release_register(f, tmp_b); +   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) { +      spe_il(f, mask, 0); +   } +} + + +/** + * Generate code to perform Z testing.  Four Z values are tested at once. + * \param dsa        Current depth-test state + * \param f          Function to which code should be appended + * \param mask       Index of register to contain depth-pass mask + * \param stored     Index of register containing values from depth buffer + * \param calculated Index of register containing per-fragment depth values + * + * \return + * If the calculated depth comparison mask is the actual mask, \c FALSE is + * returned.  If the calculated depth comparison mask is the compliment of + * the actual mask, \c TRUE is returned. + * + * \note Emits a maximum of 3 instructions. 
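+ *
+ * Returning the complement instead of inverting the mask costs nothing
+ * here: the caller simply combines the result with andc rather than and.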
+ */ +static boolean +emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, +                struct spe_function *f, int mask, int stored, int calculated) +{ +   unsigned func = (dsa->depth.enabled) +       ? dsa->depth.func : PIPE_FUNC_ALWAYS; +   int tmp = spe_allocate_available_register(f); +   boolean compliment = FALSE; + +   switch (func) { +   case PIPE_FUNC_NEVER: +      spe_il(f, mask, 0); +      break; + +   case PIPE_FUNC_NOTEQUAL: +      compliment = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_EQUAL: +      spe_ceq(f, mask, calculated, stored); +      break; + +   case PIPE_FUNC_LEQUAL: +      compliment = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_GREATER: +      spe_clgt(f, mask, calculated, stored); +      break; + +   case PIPE_FUNC_LESS: +      compliment = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_GEQUAL: +      spe_clgt(f, mask, calculated, stored); +      spe_ceq(f, tmp, calculated, stored); +      spe_or(f, mask, mask, tmp); +      break; + +   case PIPE_FUNC_ALWAYS: +      spe_il(f, mask, ~0); +      break; + +   default: +      assert(0); +      break; +   } + +   spe_release_register(f, tmp); +   return compliment; +} + + +/** + * Generate code to apply the stencil operation (after testing). + * \note Emits a maximum of 5 instructions. + * + * \warning + * Since \c out and \c in might be the same register, this routine cannot + * generate code that uses \c out as a temporary. + */ +static void +emit_stencil_op(struct spe_function *f, +                int out, int in, int mask, unsigned op, unsigned ref) +{ +   const int clamp = spe_allocate_available_register(f); +   const int clamp_mask = spe_allocate_available_register(f); +   const int result = spe_allocate_available_register(f); + +   switch(op) { +   case PIPE_STENCIL_OP_KEEP: +      assert(0); +   case PIPE_STENCIL_OP_ZERO: +      spe_il(f, result, 0); +      break; +   case PIPE_STENCIL_OP_REPLACE: +      spe_il(f, result, ref); +      break; +   case PIPE_STENCIL_OP_INCR: +      /* clamp = [0xff, 0xff, 0xff, 0xff] */ +      spe_il(f, clamp, 0x0ff); +      /* result[i] = in[i] + 1 */ +      spe_ai(f, result, in, 1); +      /* clamp_mask[i] = (result[i] > 0xff) */ +      spe_clgti(f, clamp_mask, result, 0x0ff); +      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */ +      spe_selb(f, result, result, clamp, clamp_mask); +      break; +   case PIPE_STENCIL_OP_DECR: +      spe_il(f, clamp, 0); +      spe_ai(f, result, in, -1); + +      /* If "(s-1) < 0" in signed arithemtic, then "(s-1) > MAX" in unsigned +       * arithmetic. +       */ +      spe_clgti(f, clamp_mask, result, 0x0ff); +      spe_selb(f, result, result, clamp, clamp_mask); +      break; +   case PIPE_STENCIL_OP_INCR_WRAP: +      spe_ai(f, result, in, 1); +      break; +   case PIPE_STENCIL_OP_DECR_WRAP: +      spe_ai(f, result, in, -1); +      break; +   case PIPE_STENCIL_OP_INVERT: +      spe_nor(f, result, in, in); +      break; +   default: +      assert(0); +   } + +   spe_selb(f, out, in, result, mask); + +   spe_release_register(f, result); +   spe_release_register(f, clamp_mask); +   spe_release_register(f, clamp); +} + + +/** + * Generate code to do stencil test.  Four pixels are tested at once. 
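+ * The register holding the resulting stencil values is returned; it may
+ * simply be \c stencil itself when the stored values did not need to be
+ * updated.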
+ * \param dsa        Depth / stencil test state + * \param face       0 for front face, 1 for back face + * \param f          Function to append instructions to + * \param mask       Register containing mask of fragments passing the + *                   alpha test + * \param depth_mask Register containing mask of fragments passing the + *                   depth test + * \param depth_compliment  Is \c depth_mask the compliment of the actual mask? + * \param stencil    Register containing values from stencil buffer + * \param depth_pass Register to store mask of fragments passing stencil test + *                   and depth test + * + * \note + * Emits a maximum of 10 + (3 * 5) = 25 instructions. + */ +static int +emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, +                  unsigned face, +                  struct spe_function *f, +                  int mask, +                  int depth_mask, +                  boolean depth_complement, +                  int stencil, +                  int depth_pass) +{ +   int stencil_fail = spe_allocate_available_register(f); +   int depth_fail = spe_allocate_available_register(f); +   int stencil_mask = spe_allocate_available_register(f); +   int stencil_pass = spe_allocate_available_register(f); +   int face_stencil = spe_allocate_available_register(f); +   int stencil_src = stencil; +   const unsigned ref = (dsa->stencil[face].ref_value +                         & dsa->stencil[face].valuemask); +   boolean complement = FALSE; +   int stored; +   int tmp = spe_allocate_available_register(f); + + +   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) +       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) +       && (dsa->stencil[face].valuemask != 0x0ff)) { +      stored = spe_allocate_available_register(f); +      spe_andi(f, stored, stencil, dsa->stencil[face].valuemask); +   } else { +      stored = stencil; +   } + + +   switch (dsa->stencil[face].func) { +   case PIPE_FUNC_NEVER: +      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */ +      break; + +   case PIPE_FUNC_NOTEQUAL: +      complement = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_EQUAL: +      /* stencil_mask[i] = (stored[i] == ref) */ +      spe_ceqi(f, stencil_mask, stored, ref); +      break; + +   case PIPE_FUNC_LEQUAL: +      complement = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_GREATER: +      complement = TRUE; +      /* stencil_mask[i] = (stored[i] > ref) */ +      spe_clgti(f, stencil_mask, stored, ref); +      break; + +   case PIPE_FUNC_LESS: +      complement = TRUE; +      /* FALLTHROUGH */ +   case PIPE_FUNC_GEQUAL: +      /* stencil_mask[i] = (stored[i] > ref) */ +      spe_clgti(f, stencil_mask, stored, ref); +      /* tmp[i] = (stored[i] == ref) */ +      spe_ceqi(f, tmp, stored, ref); +      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */ +      spe_or(f, stencil_mask, stencil_mask, tmp); +      break; + +   case PIPE_FUNC_ALWAYS: +      /* See comment below. */ +      break; + +   default: +      assert(0); +      break; +   } + +   if (stored != stencil) { +      spe_release_register(f, stored); +   } +   spe_release_register(f, tmp); + + +   /* ALWAYS is a very common stencil-test, so some effort is applied to +    * optimize that case.  The stencil-pass mask is the same as the input +    * fragment mask.  This makes the stencil-test (above) a no-op, and the +    * input fragment mask can be "renamed" the stencil-pass mask. 
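+    * (Hence stencil_pass is released right away in that case, and the
+    * cleanup at the end only releases it when it is not aliased to "mask".)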
+    */ +   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) { +      spe_release_register(f, stencil_pass); +      stencil_pass = mask; +   } else { +      if (complement) { +         spe_andc(f, stencil_pass, mask, stencil_mask); +      } else { +         spe_and(f, stencil_pass, mask, stencil_mask); +      } +   } + +   if (depth_complement) { +      spe_andc(f, depth_pass, stencil_pass, depth_mask); +   } else { +      spe_and(f, depth_pass, stencil_pass, depth_mask); +   } + + +   /* Conditionally emit code to update the stencil value under various +    * condititons.  Note that there is no need to generate code under the +    * following circumstances: +    * +    * - Stencil write mask is zero. +    * - For stencil-fail if the stencil test is ALWAYS +    * - For depth-fail if the stencil test is NEVER +    * - For depth-pass if the stencil test is NEVER +    * - Any of the 3 conditions if the operation is KEEP +    */ +   if (dsa->stencil[face].writemask != 0) { +      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) +          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { +         if (complement) { +            spe_and(f, stencil_fail, mask, stencil_mask); +         } else { +            spe_andc(f, stencil_fail, mask, stencil_mask); +         } + +         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail, +                         dsa->stencil[face].fail_op, +                         dsa->stencil[face].ref_value); + +         stencil_src = face_stencil; +      } + +      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) +          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) { +         if (depth_complement) { +            spe_and(f, depth_fail, stencil_pass, depth_mask); +         } else { +            spe_andc(f, depth_fail, stencil_pass, depth_mask); +         } + +         emit_stencil_op(f, face_stencil, stencil_src, depth_fail, +                         dsa->stencil[face].zfail_op, +                         dsa->stencil[face].ref_value); +         stencil_src = face_stencil; +      } + +      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) +          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) { +         emit_stencil_op(f, face_stencil, stencil_src, depth_pass, +                         dsa->stencil[face].zpass_op, +                         dsa->stencil[face].ref_value); +         stencil_src = face_stencil; +      } +   } + +   spe_release_register(f, stencil_fail); +   spe_release_register(f, depth_fail); +   spe_release_register(f, stencil_mask); +   if (stencil_pass != mask) { +      spe_release_register(f, stencil_pass); +   } + +   /* If all of the stencil operations were KEEP or the stencil write mask was +    * zero, "stencil_src" will still be set to "stencil".  In this case +    * release the "face_stencil" register.  Otherwise apply the stencil write +    * mask to select bits from the calculated stencil value and the previous +    * stencil value. 
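+    * (spe_selb() is a bit-wise select: result bits are taken from the new
+    * value where the write-mask bits are set and from the old value
+    * elsewhere.)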
+    */ +   if (stencil_src == stencil) { +      spe_release_register(f, face_stencil); +   } else if (dsa->stencil[face].writemask != 0x0ff) { +      int tmp = spe_allocate_available_register(f); + +      spe_il(f, tmp, dsa->stencil[face].writemask); +      spe_selb(f, stencil_src, stencil, stencil_src, tmp); + +      spe_release_register(f, tmp); +   } + +   return stencil_src; +} + + +void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) +{ +   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base; +   struct spe_function *const f = &cdsa->code; + +   /* This code generates a maximum of 6 (alpha test) + 3 (depth test) +    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round +    * up to 64 to make it a happy power-of-two. +    */ +   spe_init_func(f, SPE_INST_SIZE * 64); + + +   /* Allocate registers for the function's input parameters.  Cleverly (and +    * clever code is usually dangerous, but I couldn't resist) the generated +    * function returns a structure.  Returned structures start with register +    * 3, and the structure fields are ordered to match up exactly with the +    * input parameters. +    */ +   int mask = spe_allocate_register(f, 3); +   int depth = spe_allocate_register(f, 4); +   int stencil = spe_allocate_register(f, 5); +   int zvals = spe_allocate_register(f, 6); +   int frag_a = spe_allocate_register(f, 7); +   int facing = spe_allocate_register(f, 8); + +   int depth_mask = spe_allocate_available_register(f); + +   boolean depth_complement; + + +   emit_alpha_test(dsa, f, mask, frag_a); + +   depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals); + +   if (dsa->stencil[0].enabled) { +      const int front_depth_pass = spe_allocate_available_register(f); +      int front_stencil = emit_stencil_test(dsa, 0, f, mask, +                                            depth_mask, depth_complement, +                                            stencil, front_depth_pass); + +      if (dsa->stencil[1].enabled) { +         const int back_depth_pass = spe_allocate_available_register(f); +         int back_stencil = emit_stencil_test(dsa, 1, f, mask, +                                              depth_mask,  depth_complement, +                                              stencil, back_depth_pass); + +         /* If the front facing stencil value and the back facing stencil +          * value are stored in the same register, there is no need to select +          * a value based on the facing.  This can happen if the stencil value +          * was not modified due to the write masks being zero, the stencil +          * operations being KEEP, etc. 
+          */ +         if (front_stencil != back_stencil) { +            spe_selb(f, stencil, back_stencil, front_stencil, facing); +         } + +         if (back_stencil != stencil) { +            spe_release_register(f, back_stencil); +         } + +         if (front_stencil != stencil) { +            spe_release_register(f, front_stencil); +         } + +         spe_selb(f, mask, back_depth_pass, front_depth_pass, facing); + +         spe_release_register(f, back_depth_pass); +      } else { +         if (front_stencil != stencil) { +            spe_or(f, stencil, front_stencil, front_stencil); +            spe_release_register(f, front_stencil); +         } +         spe_or(f, mask, front_depth_pass, front_depth_pass); +      } + +      spe_release_register(f, front_depth_pass); +   } else if (dsa->depth.enabled) { +      if (depth_complement) { +         spe_andc(f, mask, mask, depth_mask); +      } else { +         spe_and(f, mask, mask, depth_mask); +      } +   } + +   if (dsa->depth.writemask) { +         spe_selb(f, depth, depth, zvals, mask); +   } + +   spe_bi(f, 0, 0, 0);  /* return from function call */ + + +#if 0 +   { +      const uint32_t *p = f->store; +      unsigned i; + +      printf("# alpha (%sabled)\n", +             (dsa->alpha.enabled) ? "en" : "dis"); +      printf("#    func: %u\n", dsa->alpha.func); +      printf("#    ref: %.2f\n", dsa->alpha.ref); + +      printf("# depth (%sabled)\n", +             (dsa->depth.enabled) ? "en" : "dis"); +      printf("#    func: %u\n", dsa->depth.func); + +      for (i = 0; i < 2; i++) { +         printf("# %s stencil (%sabled)\n", +                (i == 0) ? "front" : "back", +                (dsa->stencil[i].enabled) ? "en" : "dis"); + +         printf("#    func: %u\n", dsa->stencil[i].func); +         printf("#    op (sf, zf, zp): %u %u %u\n", +                dsa->stencil[i].fail_op, +                dsa->stencil[i].zfail_op, +                dsa->stencil[i].zpass_op); +         printf("#    ref value / value mask / write mask: %02x %02x %02x\n", +                dsa->stencil[i].ref_value, +                dsa->stencil[i].valuemask, +                dsa->stencil[i].writemask); +      } + +      printf("\t.text\n"); +      for (/* empty */; p < f->csr; p++) { +         printf("\t.long\t0x%04x\n", *p); +      } +      fflush(stdout); +   } +#endif +} + + +/** + * \note Emits a maximum of 3 instructions + */ +static int +emit_alpha_factor_calculation(struct spe_function *f, +                              unsigned factor, +                              int src_alpha, int dst_alpha, int const_alpha) +{ +   int factor_reg; +   int tmp; + + +   switch (factor) { +   case PIPE_BLENDFACTOR_ONE: +      factor_reg = -1; +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      factor_reg = spe_allocate_available_register(f); + +      spe_or(f, factor_reg, src_alpha, src_alpha); +      break; + +   case PIPE_BLENDFACTOR_DST_ALPHA: +      factor_reg = dst_alpha; +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +      factor_reg = -1; +      break; + +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +      factor_reg = spe_allocate_available_register(f); + +      tmp = spe_allocate_available_register(f); +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor_reg, tmp, const_alpha); +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_CONST_ALPHA: +      factor_reg = const_alpha; +      break; + +   case PIPE_BLENDFACTOR_ZERO: +      factor_reg = -1; +      break; + +  
 case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      tmp = spe_allocate_available_register(f); +      factor_reg = spe_allocate_available_register(f); + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor_reg, tmp, src_alpha); + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      tmp = spe_allocate_available_register(f); +      factor_reg = spe_allocate_available_register(f); + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor_reg, tmp, dst_alpha); + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_SRC1_ALPHA: +   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +   default: +      assert(0); +      factor_reg = -1; +      break; +   } + +   return factor_reg; +} + + +/** + * \note Emits a maximum of 6 instructions + */ +static void +emit_color_factor_calculation(struct spe_function *f, +                              unsigned sF, unsigned mask, +                              const int *src, +                              const int *dst, +                              const int *const_color, +                              int *factor) +{ +   int tmp; +   unsigned i; + + +   factor[0] = -1; +   factor[1] = -1; +   factor[2] = -1; +   factor[3] = -1; + +   switch (sF) { +   case PIPE_BLENDFACTOR_ONE: +      break; + +   case PIPE_BLENDFACTOR_SRC_COLOR: +      for (i = 0; i < 3; ++i) { +         if ((mask & (1U << i)) != 0) { +            factor[i] = spe_allocate_available_register(f); +            spe_or(f, factor[i], src[i], src[i]); +         } +      } +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      factor[0] = spe_allocate_available_register(f); +      factor[1] = factor[0]; +      factor[2] = factor[0]; + +      spe_or(f, factor[0], src[3], src[3]); +      break; + +   case PIPE_BLENDFACTOR_DST_ALPHA: +      factor[0] = dst[3]; +      factor[1] = dst[3]; +      factor[2] = dst[3]; +      break; + +   case PIPE_BLENDFACTOR_DST_COLOR: +      factor[0] = dst[0]; +      factor[1] = dst[1]; +      factor[2] = dst[2]; +      break; + +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +      tmp = spe_allocate_available_register(f); +      factor[0] = spe_allocate_available_register(f); +      factor[1] = factor[0]; +      factor[2] = factor[0]; + +      /* Alpha saturate means min(As, 1-Ad). 
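+       *
+       * The sequence below builds the constant 1.0f with il/cuflt, forms
+       * 1.0 - Ad with fs, and then chooses between As and 1.0 - Ad with an
+       * fcgt compare followed by a selb select.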
+       */ +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, tmp, tmp, dst[3]); +      spe_fcgt(f, factor[0], tmp, src[3]); +      spe_selb(f, factor[0], src[3], tmp, factor[0]); + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_INV_CONST_COLOR: +      tmp = spe_allocate_available_register(f); +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); + +      for (i = 0; i < 3; i++) { +         factor[i] = spe_allocate_available_register(f); + +         spe_fs(f, factor[i], tmp, const_color[i]); +      } +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_CONST_COLOR: +      for (i = 0; i < 3; i++) { +         factor[i] = const_color[i]; +      } +      break; + +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +      factor[0] = spe_allocate_available_register(f); +      factor[1] = factor[0]; +      factor[2] = factor[0]; + +      tmp = spe_allocate_available_register(f); +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor[0], tmp, const_color[3]); +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_CONST_ALPHA: +      factor[0] = const_color[3]; +      factor[1] = factor[0]; +      factor[2] = factor[0]; +      break; + +   case PIPE_BLENDFACTOR_ZERO: +      break; + +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      tmp = spe_allocate_available_register(f); + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); + +      for (i = 0; i < 3; ++i) { +         if ((mask & (1U << i)) != 0) { +            factor[i] = spe_allocate_available_register(f); +            spe_fs(f, factor[i], tmp, src[i]); +         } +      } + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      tmp = spe_allocate_available_register(f); +      factor[0] = spe_allocate_available_register(f); +      factor[1] = factor[0]; +      factor[2] = factor[0]; + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor[0], tmp, src[3]); + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      tmp = spe_allocate_available_register(f); +      factor[0] = spe_allocate_available_register(f); +      factor[1] = factor[0]; +      factor[2] = factor[0]; + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); +      spe_fs(f, factor[0], tmp, dst[3]); + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      tmp = spe_allocate_available_register(f); + +      spe_il(f, tmp, 1); +      spe_cuflt(f, tmp, tmp, 0); + +      for (i = 0; i < 3; ++i) { +         if ((mask & (1U << i)) != 0) { +            factor[i] = spe_allocate_available_register(f); +            spe_fs(f, factor[i], tmp, dst[i]); +         } +      } + +      spe_release_register(f, tmp); +      break; + +   case PIPE_BLENDFACTOR_SRC1_COLOR: +   case PIPE_BLENDFACTOR_SRC1_ALPHA: +   case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +   default: +      assert(0); +   } +} + + +static void +emit_blend_calculation(struct spe_function *f, +                       unsigned func, unsigned sF, unsigned dF, +                       int src, int src_factor, int dst, int dst_factor) +{ +   int tmp = spe_allocate_available_register(f); + +   switch (func) { +   case PIPE_BLEND_ADD: +      if (sF == PIPE_BLENDFACTOR_ONE) { +         if (dF == PIPE_BLENDFACTOR_ZERO) { +            /* Do nothing. 
*/
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fa(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fma(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fms(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, src);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, dst, src);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dF == PIPE_BLENDFACTOR_ZERO) {
+         spe_fm(f, src, src, src_factor);
+      } else {
+         spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, dst, dst_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_MIN:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);
+      break;
+
+   case PIPE_BLEND_MAX:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb)
+{
+   struct pipe_blend_state *const b = &cb->base;
+   struct spe_function *const f = &cb->code;
+
+   /* This code generates a maximum of 3 (source alpha factor)
+    * + 3 (destination alpha factor) + (3 * 6) (source color factor)
+    * + (3 * 6) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 55 instructions.  Round up to 64 to
+    * make it a happy power-of-two. 
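+    *
+    * On entry, the fragment colors are expected in registers 3-6, the
+    * existing pixel colors in registers 7-10, and the constant blend color
+    * in registers 11-14 (see the spe_allocate_register() calls below).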
+    */ +   spe_init_func(f, SPE_INST_SIZE * 64); + + +   const int frag[4] = { +      spe_allocate_register(f, 3), +      spe_allocate_register(f, 4), +      spe_allocate_register(f, 5), +      spe_allocate_register(f, 6), +   }; +   const int pixel[4] = { +      spe_allocate_register(f, 7), +      spe_allocate_register(f, 8), +      spe_allocate_register(f, 9), +      spe_allocate_register(f, 10), +   }; +   const int const_color[4] = { +      spe_allocate_register(f, 11), +      spe_allocate_register(f, 12), +      spe_allocate_register(f, 13), +      spe_allocate_register(f, 14), +   }; +   unsigned func[4]; +   unsigned sF[4]; +   unsigned dF[4]; +   unsigned i; +   int src_factor[4]; +   int dst_factor[4]; + + +   /* Does the selected blend mode make use of the source / destination +    * color (RGB) blend factors? +    */ +   boolean need_color_factor = b->blend_enable +       && (b->rgb_func != PIPE_BLEND_MIN) +       && (b->rgb_func != PIPE_BLEND_MAX); + +   /* Does the selected blend mode make use of the source / destination +    * alpha blend factors? +    */ +   boolean need_alpha_factor = b->blend_enable +       && (b->alpha_func != PIPE_BLEND_MIN) +       && (b->alpha_func != PIPE_BLEND_MAX); + + +   if (b->blend_enable) { +      sF[0] = b->rgb_src_factor; +      sF[1] = sF[0]; +      sF[2] = sF[0]; +      switch (b->alpha_src_factor & 0x0f) { +      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +         sF[3] = PIPE_BLENDFACTOR_ONE; +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +      case PIPE_BLENDFACTOR_DST_COLOR: +      case PIPE_BLENDFACTOR_CONST_COLOR: +      case PIPE_BLENDFACTOR_SRC1_COLOR: +         sF[3] = b->alpha_src_factor + 1; +         break; +      default: +         sF[3] = b->alpha_src_factor; +      } + +      dF[0] = b->rgb_dst_factor; +      dF[1] = dF[0]; +      dF[2] = dF[0]; +      switch (b->alpha_dst_factor & 0x0f) { +      case PIPE_BLENDFACTOR_SRC_COLOR: +      case PIPE_BLENDFACTOR_DST_COLOR: +      case PIPE_BLENDFACTOR_CONST_COLOR: +      case PIPE_BLENDFACTOR_SRC1_COLOR: +         dF[3] = b->alpha_dst_factor + 1; +         break; +      default: +         dF[3] = b->alpha_dst_factor; +      } + +      func[0] = b->rgb_func; +      func[1] = func[0]; +      func[2] = func[0]; +      func[3] = b->alpha_func; +   } else { +      sF[0] = PIPE_BLENDFACTOR_ONE; +      sF[1] = PIPE_BLENDFACTOR_ONE; +      sF[2] = PIPE_BLENDFACTOR_ONE; +      sF[3] = PIPE_BLENDFACTOR_ONE; +      dF[0] = PIPE_BLENDFACTOR_ZERO; +      dF[1] = PIPE_BLENDFACTOR_ZERO; +      dF[2] = PIPE_BLENDFACTOR_ZERO; +      dF[3] = PIPE_BLENDFACTOR_ZERO; + +      func[0] = PIPE_BLEND_ADD; +      func[1] = PIPE_BLEND_ADD; +      func[2] = PIPE_BLEND_ADD; +      func[3] = PIPE_BLEND_ADD; +   } + + +   /* If alpha writing is enabled and the alpha blend mode requires use of +    * the alpha factor, calculate the alpha factor. +    */ +   if (((b->colormask & 8) != 0) && need_alpha_factor) { +      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3], +                                                    frag[3], pixel[3]); + +      /* If the alpha destination blend factor is the same as the alpha source +       * blend factor, re-use the previously calculated value. +       */ +      dst_factor[3] = (dF[3] == sF[3]) +          ? 
src_factor[3] +          : emit_alpha_factor_calculation(f, dF[3], const_color[3], +                                          frag[3], pixel[3]); +   } + + +   if (sF[0] == sF[3]) { +      src_factor[0] = src_factor[3]; +      src_factor[1] = src_factor[3]; +      src_factor[2] = src_factor[3]; +   } else if (sF[0] == dF[3]) { +      src_factor[0] = dst_factor[3]; +      src_factor[1] = dst_factor[3]; +      src_factor[2] = dst_factor[3]; +   } else if (need_color_factor) { +      emit_color_factor_calculation(f, +                                    b->rgb_src_factor, +                                    b->colormask, +                                    frag, pixel, const_color, src_factor); +   } + + +   if (dF[0] == sF[3]) { +      dst_factor[0] = src_factor[3]; +      dst_factor[1] = src_factor[3]; +      dst_factor[2] = src_factor[3]; +   } else if (dF[0] == dF[3]) { +      dst_factor[0] = dst_factor[3]; +      dst_factor[1] = dst_factor[3]; +      dst_factor[2] = dst_factor[3]; +   } else if (dF[0] == sF[0]) { +      dst_factor[0] = src_factor[0]; +      dst_factor[1] = src_factor[1]; +      dst_factor[2] = src_factor[2]; +   } else if (need_color_factor) { +      emit_color_factor_calculation(f, +                                    b->rgb_dst_factor, +                                    b->colormask, +                                    frag, pixel, const_color, dst_factor); +   } + + + +   for (i = 0; i < 4; ++i) { +      if ((b->colormask & (1U << i)) != 0) { +         emit_blend_calculation(f, +                                func[i], sF[i], dF[i], +                                frag[i], src_factor[i], +                                pixel[i], dst_factor[i]); +      } +   } + +   spe_bi(f, 0, 0, 0); + +#if 0 +   { +      const uint32_t *p = f->store; + +      printf("# %u instructions\n", f->csr - f->store); +      printf("# blend (%sabled)\n", +             (cb->base.blend_enable) ? "en" : "dis"); +      printf("#    RGB func / sf / df: %u %u %u\n", +             cb->base.rgb_func, +             cb->base.rgb_src_factor, +             cb->base.rgb_dst_factor); +      printf("#    ALP func / sf / df: %u %u %u\n", +             cb->base.alpha_func, +             cb->base.alpha_src_factor, +             cb->base.alpha_dst_factor); + +      printf("\t.text\n"); +      for (/* empty */; p < f->csr; p++) { +         printf("\t.long\t0x%04x\n", *p); +      } +      fflush(stdout); +   } +#endif +} + + +static int +PC_OFFSET(const struct spe_function *f, const void *d) +{ +   const intptr_t pc = (intptr_t) &f->store[f->num_inst]; +   const intptr_t ea = ~0x0f & (intptr_t) d; + +   return (ea - pc) >> 2; +} + + +/** + * Generate code to perform color conversion and logic op + * + * \bug + * The code generated by this function should also perform dithering. + * + * \bug + * The code generated by this function should also perform color-write + * masking. + * + * \bug + * Only two framebuffer formats are supported at this time. + */ +void +cell_generate_logic_op(struct spe_function *f, +                       const struct pipe_blend_state *blend, +                       struct pipe_surface *surf) +{ +   const unsigned logic_op = (blend->logicop_enable) +       ? blend->logicop_func : PIPE_LOGICOP_COPY; + +   /* This code generates a maximum of 37 instructions.  An additional 32 +    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up +    * to 64 to make it a happy power-of-two. 
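+    *
+    * The 32-byte data area holds two quadwords: the "hi" transpose shuffle
+    * pattern and the color-conversion shuffle pattern (the "lo" transpose
+    * pattern is derived from the "hi" one by adding 0x0808 to each
+    * halfword).  It occupies the last eight words of the 64-word buffer and
+    * is loaded PC-relative with lqr() via PC_OFFSET().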
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.
+    */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the beginning of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   switch (surf->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      data[0] = 0x00010203;
+      data[1] = 0x10111213;
+      data[2] = 0x04050607;
+      data[3] = 0x14151617;
+      data[4] = 0x0c000408;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      data[0] = 0x03020100;
+      data[1] = 0x13121110;
+      data[2] = 0x07060504;
+      data[3] = 0x17161514;
+      data[4] = 0x0804000c;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   default:
+      fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()");
+      ASSERT(0);
+   }
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted 
fragment colors and the pixel colors. +    */ +   switch (logic_op) { +   case PIPE_LOGICOP_CLEAR: +      spe_il(f, frag[0], 0); +      spe_il(f, frag[1], 0); +      spe_il(f, frag[2], 0); +      spe_il(f, frag[3], 0); +      break; +   case PIPE_LOGICOP_NOR: +      spe_nor(f, frag[0], frag[0], pixel[0]); +      spe_nor(f, frag[1], frag[1], pixel[1]); +      spe_nor(f, frag[2], frag[2], pixel[2]); +      spe_nor(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_AND_INVERTED: +      spe_andc(f, frag[0], pixel[0], frag[0]); +      spe_andc(f, frag[1], pixel[1], frag[1]); +      spe_andc(f, frag[2], pixel[2], frag[2]); +      spe_andc(f, frag[3], pixel[3], frag[3]); +      break; +   case PIPE_LOGICOP_COPY_INVERTED: +      spe_nor(f, frag[0], frag[0], frag[0]); +      spe_nor(f, frag[1], frag[1], frag[1]); +      spe_nor(f, frag[2], frag[2], frag[2]); +      spe_nor(f, frag[3], frag[3], frag[3]); +      break; +   case PIPE_LOGICOP_AND_REVERSE: +      spe_andc(f, frag[0], frag[0], pixel[0]); +      spe_andc(f, frag[1], frag[1], pixel[1]); +      spe_andc(f, frag[2], frag[2], pixel[2]); +      spe_andc(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_XOR: +      spe_xor(f, frag[0], frag[0], pixel[0]); +      spe_xor(f, frag[1], frag[1], pixel[1]); +      spe_xor(f, frag[2], frag[2], pixel[2]); +      spe_xor(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_NAND: +      spe_nand(f, frag[0], frag[0], pixel[0]); +      spe_nand(f, frag[1], frag[1], pixel[1]); +      spe_nand(f, frag[2], frag[2], pixel[2]); +      spe_nand(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_AND: +      spe_and(f, frag[0], frag[0], pixel[0]); +      spe_and(f, frag[1], frag[1], pixel[1]); +      spe_and(f, frag[2], frag[2], pixel[2]); +      spe_and(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_EQUIV: +      spe_eqv(f, frag[0], frag[0], pixel[0]); +      spe_eqv(f, frag[1], frag[1], pixel[1]); +      spe_eqv(f, frag[2], frag[2], pixel[2]); +      spe_eqv(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_OR_INVERTED: +      spe_orc(f, frag[0], pixel[0], frag[0]); +      spe_orc(f, frag[1], pixel[1], frag[1]); +      spe_orc(f, frag[2], pixel[2], frag[2]); +      spe_orc(f, frag[3], pixel[3], frag[3]); +      break; +   case PIPE_LOGICOP_COPY: +      break; +   case PIPE_LOGICOP_OR_REVERSE: +      spe_orc(f, frag[0], frag[0], pixel[0]); +      spe_orc(f, frag[1], frag[1], pixel[1]); +      spe_orc(f, frag[2], frag[2], pixel[2]); +      spe_orc(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_OR: +      spe_or(f, frag[0], frag[0], pixel[0]); +      spe_or(f, frag[1], frag[1], pixel[1]); +      spe_or(f, frag[2], frag[2], pixel[2]); +      spe_or(f, frag[3], frag[3], pixel[3]); +      break; +   case PIPE_LOGICOP_SET: +      spe_il(f, frag[0], ~0); +      spe_il(f, frag[1], ~0); +      spe_il(f, frag[2], ~0); +      spe_il(f, frag[3], ~0); +      break; + +   /* These two cases are short-circuited above. +    */ +   case PIPE_LOGICOP_INVERT: +   case PIPE_LOGICOP_NOOP: +   default: +      assert(0); +   } + + +   /* Apply fragment mask. 
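+    *
+    * The ilh/shufb pairs below replicate each fragment's all-ones or
+    * all-zeros mask word across a full quadword so that selb can write the
+    * new color only where the corresponding fragment is live.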
+    */ +   spe_ilh(f, tmp[0], 0x0000); +   spe_ilh(f, tmp[1], 0x0404); +   spe_ilh(f, tmp[2], 0x0808); +   spe_ilh(f, tmp[3], 0x0c0c); + +   spe_shufb(f, tmp[0], mask, mask, tmp[0]); +   spe_shufb(f, tmp[1], mask, mask, tmp[1]); +   spe_shufb(f, tmp[2], mask, mask, tmp[2]); +   spe_shufb(f, tmp[3], mask, mask, tmp[3]); + +   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]); +   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]); +   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]); +   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]); + +   spe_bi(f, 0, 0, 0); + +#if 0 +   { +      const uint32_t *p = f->store; +      unsigned i; + +      printf("# %u instructions\n", f->csr - f->store); + +      printf("\t.text\n"); +      for (i = 0; i < 64; i++) { +         printf("\t.long\t0x%04x\n", p[i]); +      } +      fflush(stdout); +   } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h new file mode 100644 index 0000000000..a8267a5133 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h @@ -0,0 +1,39 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CELL_STATE_PER_FRAGMENT_H +#define CELL_STATE_PER_FRAGMENT_H + +extern void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa); + +extern void +cell_generate_alpha_blend(struct cell_blend_state *cb); + +extern void +cell_generate_logic_op(struct spe_function *f, +                       const struct pipe_blend_state *blend, +                       struct pipe_surface *surf); + +#endif /* CELL_STATE_PER_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c new file mode 100644 index 0000000000..bf517ea563 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -0,0 +1,219 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_parse.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_gen_fp.h" + + +/** cast wrapper */ +static INLINE struct cell_fragment_shader_state * +cell_fragment_shader_state(void *shader) +{ +   return (struct cell_fragment_shader_state *) shader; +} + + +/** cast wrapper */ +static INLINE struct cell_vertex_shader_state * +cell_vertex_shader_state(void *shader) +{ +   return (struct cell_vertex_shader_state *) shader; +} + + +/** + * Create fragment shader state. + * Called via pipe->create_fs_state() + */ +static void * +cell_create_fs_state(struct pipe_context *pipe, +                     const struct pipe_shader_state *templ) +{ +   struct cell_context *cell = cell_context(pipe); +   struct cell_fragment_shader_state *cfs; + +   cfs = CALLOC_STRUCT(cell_fragment_shader_state); +   if (!cfs) +      return NULL; + +   cfs->shader.tokens = tgsi_dup_tokens(templ->tokens); +   if (!cfs->shader.tokens) { +      FREE(cfs); +      return NULL; +   } + +   tgsi_scan_shader(templ->tokens, &cfs->info); + +   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); + +   return cfs; +} + + +/** + * Called via pipe->bind_fs_state() + */ +static void +cell_bind_fs_state(struct pipe_context *pipe, void *fs) +{ +   struct cell_context *cell = cell_context(pipe); + +   cell->fs = cell_fragment_shader_state(fs); + +   cell->dirty |= CELL_NEW_FS; +} + + +/** + * Called via pipe->delete_fs_state() + */ +static void +cell_delete_fs_state(struct pipe_context *pipe, void *fs) +{ +   struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); + +   spe_release_func(&cfs->code); + +   FREE((void *) cfs->shader.tokens); +   FREE(cfs); +} + + +/** + * Create vertex shader state. 
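+ * Vertex shaders are handed to the draw module (draw_create_vertex_shader())
+ * rather than being compiled to SPE code like the fragment shaders above.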
+ * Called via pipe->create_vs_state() + */ +static void * +cell_create_vs_state(struct pipe_context *pipe, +                     const struct pipe_shader_state *templ) +{ +   struct cell_context *cell = cell_context(pipe); +   struct cell_vertex_shader_state *cvs; + +   cvs = CALLOC_STRUCT(cell_vertex_shader_state); +   if (!cvs) +      return NULL; + +   cvs->shader.tokens = tgsi_dup_tokens(templ->tokens); +   if (!cvs->shader.tokens) { +      FREE(cvs); +      return NULL; +   } + +   tgsi_scan_shader(templ->tokens, &cvs->info); + +   cvs->draw_data = draw_create_vertex_shader(cell->draw, &cvs->shader); +   if (cvs->draw_data == NULL) { +      FREE( (void *) cvs->shader.tokens ); +      FREE( cvs ); +      return NULL; +   } + +   return cvs; +} + + +/** + * Called via pipe->bind_vs_state() + */ +static void +cell_bind_vs_state(struct pipe_context *pipe, void *vs) +{ +   struct cell_context *cell = cell_context(pipe); + +   cell->vs = cell_vertex_shader_state(vs); + +   draw_bind_vertex_shader(cell->draw, +                           (cell->vs ? cell->vs->draw_data : NULL)); + +   cell->dirty |= CELL_NEW_VS; +} + + +/** + * Called via pipe->delete_vs_state() + */ +static void +cell_delete_vs_state(struct pipe_context *pipe, void *vs) +{ +   struct cell_context *cell = cell_context(pipe); +   struct cell_vertex_shader_state *cvs = cell_vertex_shader_state(vs); + +   draw_delete_vertex_shader(cell->draw, cvs->draw_data); +   FREE( (void *) cvs->shader.tokens ); +   FREE( cvs ); +} + + +/** + * Called via pipe->set_constant_buffer() + */ +static void +cell_set_constant_buffer(struct pipe_context *pipe, +                         uint shader, uint index, +                         const struct pipe_constant_buffer *buf) +{ +   struct cell_context *cell = cell_context(pipe); + +   assert(shader < PIPE_SHADER_TYPES); +   assert(index == 0); + +   draw_flush(cell->draw); + +   /* note: reference counting */ +   pipe_buffer_reference(pipe->screen, +                         &cell->constants[shader].buffer, +                         buf->buffer); + +   if (shader == PIPE_SHADER_VERTEX) +      cell->dirty |= CELL_NEW_VS_CONSTANTS; +   else if (shader == PIPE_SHADER_FRAGMENT) +      cell->dirty |= CELL_NEW_FS_CONSTANTS; +} + + +void +cell_init_shader_functions(struct cell_context *cell) +{ +   cell->pipe.create_fs_state = cell_create_fs_state; +   cell->pipe.bind_fs_state   = cell_bind_fs_state; +   cell->pipe.delete_fs_state = cell_delete_fs_state; + +   cell->pipe.create_vs_state = cell_create_vs_state; +   cell->pipe.bind_vs_state   = cell_bind_vs_state; +   cell->pipe.delete_vs_state = cell_delete_vs_state; + +   cell->pipe.set_constant_buffer = cell_set_constant_buffer; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c new file mode 100644 index 0000000000..fbe55c8472 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c @@ -0,0 +1,79 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "cell_context.h" +#include "cell_state.h" + +#include "draw/draw_context.h" + + +static void +cell_set_vertex_elements(struct pipe_context *pipe, +                         unsigned count, +                         const struct pipe_vertex_element *elements) +{ +   struct cell_context *cell = cell_context(pipe); + +   assert(count <= PIPE_MAX_ATTRIBS); + +   memcpy(cell->vertex_element, elements, count * sizeof(elements[0])); +   cell->num_vertex_elements = count; + +   cell->dirty |= CELL_NEW_VERTEX; + +   draw_set_vertex_elements(cell->draw, count, elements); +} + + +static void +cell_set_vertex_buffers(struct pipe_context *pipe, +                        unsigned count, +                        const struct pipe_vertex_buffer *buffers) +{ +   struct cell_context *cell = cell_context(pipe); + +   assert(count <= PIPE_MAX_ATTRIBS); + +   memcpy(cell->vertex_buffer, buffers, count * sizeof(buffers[0])); +   cell->num_vertex_buffers = count; + +   cell->dirty |= CELL_NEW_VERTEX; + +   draw_set_vertex_buffers(cell->draw, count, buffers); +} + + +void +cell_init_vertex_functions(struct cell_context *cell) +{ +   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers; +   cell->pipe.set_vertex_elements = cell_set_vertex_elements; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c new file mode 100644 index 0000000000..c9203fee08 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.c @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "util/u_rect.h" +#include "cell_context.h" +#include "cell_surface.h" + + +void +cell_init_surface_functions(struct cell_context *cell) +{ +   cell->pipe.surface_copy = util_surface_copy; +   cell->pipe.surface_fill = util_surface_fill; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.h b/src/gallium/drivers/cell/ppu/cell_surface.h new file mode 100644 index 0000000000..9e58f32944 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.h @@ -0,0 +1,42 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef CELL_SURFACE_H +#define CELL_SURFACE_H + + +struct cell_context; + + +extern void +cell_init_surface_functions(struct cell_context *cell); + + +#endif /* SP_SURFACE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c new file mode 100644 index 0000000000..9ba995ab7d --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -0,0 +1,531 @@ +/************************************************************************** + *  + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  *   Michel Dänzer <michel@tungstengraphics.com> +  *   Brian Paul +  */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static unsigned +minify(unsigned d) +{ +   return MAX2(1, d>>1); +} + + +static void +cell_texture_layout(struct cell_texture *ct) +{ +   struct pipe_texture *pt = &ct->base; +   unsigned level; +   unsigned width = pt->width[0]; +   unsigned height = pt->height[0]; +   unsigned depth = pt->depth[0]; + +   ct->buffer_size = 0; + +   for ( level = 0 ; level <= pt->last_level ; level++ ) { +      unsigned size; +      unsigned w_tile, h_tile; + +      assert(level < CELL_MAX_TEXTURE_LEVELS); + +      /* width, height, rounded up to tile size */ +      w_tile = align(width, TILE_SIZE); +      h_tile = align(height, TILE_SIZE); + +      pt->width[level] = width; +      pt->height[level] = height; +      pt->depth[level] = depth; +      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);   +      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);   + +      ct->stride[level] = pt->nblocksx[level] * pt->block.size; + +      ct->level_offset[level] = ct->buffer_size; + +      size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size; +      if (pt->target == PIPE_TEXTURE_CUBE) +         size *= 6; +      else +         size *= depth; + +      ct->buffer_size += size; + +      width  = minify(width); +      height = minify(height); +      depth = minify(depth); +   } +} + + +static struct pipe_texture * +cell_texture_create(struct pipe_screen *screen, +                    const struct pipe_texture *templat) +{ +   struct pipe_winsys *ws = screen->winsys; +   struct cell_texture *ct = CALLOC_STRUCT(cell_texture); +   if (!ct) +      return NULL; + +   ct->base = *templat; +   ct->base.refcount = 1; +   ct->base.screen = screen; + +   cell_texture_layout(ct); + +   ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL, +                                   ct->buffer_size); + +   if (!ct->buffer) { +      FREE(ct); +      return NULL; +   } + +   return &ct->base; +} + + +static void +cell_texture_release(struct pipe_screen *screen, +                     struct pipe_texture **pt) +{ +   if (!*pt) +      return; + +   /* +   DBG("%s %p refcount will be %d\n", +       __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); +   */ +   if (--(*pt)->refcount <= 0) { +      /* Delete this texture now. +       * But note that the underlying pipe_buffer may linger... +       */ +      struct cell_texture *ct = cell_texture(*pt); +      uint i; + +      /* +      DBG("%s deleting %p\n", __FUNCTION__, (void *) ct); +      */ + +      pipe_buffer_reference(screen, &ct->buffer, NULL); + +      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { +         /* Unreference the tiled image buffer. +          * It may not actually be deleted until a fence is hit. +          */ +         if (ct->tiled_buffer[i]) { +            ct->tiled_mapped[i] = NULL; +            pipe_buffer_reference(screen, &ct->tiled_buffer[i], NULL); +         } +      } + +      FREE(ct); +   } +   *pt = NULL; +} + + + +/** + * Convert image from linear layout to tiled layout.  4-byte pixels. 
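+ *
+ * For a pixel at (x, y) in the linear source image, the destination index
+ * computed by the loops below is
+ *
+ *    tile  = (y / tile_size) * w_t + (x / tile_size)
+ *    index = tile * tile_size * tile_size
+ *            + (y % tile_size) * tile_size + (x % tile_size)
+ *
+ * where w_t is the image width in whole tiles.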
+ */ +static void +twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, +                   uint src_stride, const uint *src) +{ +   const uint tile_size2 = tile_size * tile_size; +   const uint h_t = (h + tile_size - 1) / tile_size; +   const uint w_t = (w + tile_size - 1) / tile_size; + +   uint it, jt;  /* tile counters */ +   uint i, j;    /* intra-tile counters */ + +   src_stride /= 4; /* convert from bytes to pixels */ + +   /* loop over dest tiles */ +   for (it = 0; it < h_t; it++) { +      for (jt = 0; jt < w_t; jt++) { +         /* start of dest tile: */ +         uint *tdst = dst + (it * w_t + jt) * tile_size2; + +         /* compute size of this tile (may be smaller than tile_size) */ +         /* XXX note: a compiler bug was found here. That's why the code +          * looks as it does. +          */ +         uint tile_width = w - jt * tile_size; +         tile_width = MIN2(tile_width, tile_size); +         uint tile_height = h - it * tile_size; +         tile_height = MIN2(tile_height, tile_size); + +         /* loop over texels in the tile */ +         for (i = 0; i < tile_height; i++) { +            for (j = 0; j < tile_width; j++) { +               const uint srci = it * tile_size + i; +               const uint srcj = jt * tile_size + j; +               ASSERT(srci < h); +               ASSERT(srcj < w); +               tdst[i * tile_size + j] = src[srci * src_stride + srcj]; +            } +         } +      } +   } +} + + +/** + * For Cell.  Basically, rearrange the pixels/quads from this layout: + *  +--+--+--+--+ + *  |p0|p1|p2|p3|.... + *  +--+--+--+--+ + * + * to this layout: + *  +--+--+ + *  |p0|p1|.... + *  +--+--+ + *  |p2|p3| + *  +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ +   int y, x; + +   for (y = 0; y < TILE_SIZE; y+=2) { +      for (x = 0; x < TILE_SIZE; x+=2) { +         int k = 4 * (y/2 * TILE_SIZE/2 + x/2); +         tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; +         tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; +         tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; +         tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; +      } +   } +} + + +/** + * Convert image from tiled layout to linear layout.  4-byte pixels. + */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, +                     uint dst_stride, const uint *src) +{ +   const uint tile_size2 = tile_size * tile_size; +   const uint h_t = (h + tile_size - 1) / tile_size; +   const uint w_t = (w + tile_size - 1) / tile_size; +   uint *tile_buf; +   uint it, jt;  /* tile counters */ +   uint i, j;    /* intra-tile counters */ + +   dst_stride /= 4; /* convert from bytes to pixels */ + +   tile_buf = align_malloc(tile_size * tile_size * 4, 16); +    +   /* loop over src tiles */ +   for (it = 0; it < h_t; it++) { +      for (jt = 0; jt < w_t; jt++) { +         /* start of src tile: */ +         const uint *tsrc = src + (it * w_t + jt) * tile_size2; +          +         twiddle_tile(tsrc, tile_buf); +         tsrc = tile_buf; + +         /* compute size of this tile (may be smaller than tile_size) */ +         /* XXX note: a compiler bug was found here. That's why the code +          * looks as it does. 
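+          * (The clamps are kept in separate statements below instead of
+          * being folded into the initializers.)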
+          */ +         uint tile_width = w - jt * tile_size; +         tile_width = MIN2(tile_width, tile_size); +         uint tile_height = h - it * tile_size; +         tile_height = MIN2(tile_height, tile_size); + +         /* loop over texels in the tile */ +         for (i = 0; i < tile_height; i++) { +            for (j = 0; j < tile_width; j++) { +               uint dsti = it * tile_size + i; +               uint dstj = jt * tile_size + j; +               ASSERT(dsti < h); +               ASSERT(dstj < w); +               dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; +            } +         } +      } +   } + +   align_free(tile_buf); +} + + +/** + * Convert linear texture image data to tiled format for SPU usage. + */ +static void +cell_twiddle_texture(struct pipe_screen *screen, +                     struct pipe_surface *surface) +{ +   struct cell_texture *ct = cell_texture(surface->texture); +   const uint level = surface->level; +   const uint texWidth = ct->base.width[level]; +   const uint texHeight = ct->base.height[level]; +   const uint bufWidth = align(texWidth, TILE_SIZE); +   const uint bufHeight = align(texHeight, TILE_SIZE); +   const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); +   const uint *src = (const uint *) map; + +   switch (ct->base.format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +   case PIPE_FORMAT_B8G8R8A8_UNORM: +   case PIPE_FORMAT_S8Z24_UNORM: +      { +         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; +         int offset = bufWidth * bufHeight * 4 * surface->face; +         uint *dst; + +         if (!ct->tiled_buffer[level]) { +            /* allocate buffer for tiled data now */ +            struct pipe_winsys *ws = screen->winsys; +            uint bytes = bufWidth * bufHeight * 4 * numFaces; +            ct->tiled_buffer[level] = +               ws->buffer_create(ws, 16, PIPE_BUFFER_USAGE_PIXEL, bytes); +            /* and map it */ +            ct->tiled_mapped[level] = +               ws->buffer_map(ws, ct->tiled_buffer[level], +                              PIPE_BUFFER_USAGE_GPU_READ); +         } +         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset); + +         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, +                            surface->stride, src); +      } +      break; +   default: +      printf("Cell: twiddle unsupported texture format %s\n", +             pf_name(ct->base.format)); +   } + +   screen->surface_unmap(screen, surface); +} + + +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, +                     struct pipe_surface *surface) +{ +   struct cell_texture *ct = cell_texture(surface->texture); +   const uint level = surface->level; +   const uint texWidth = ct->base.width[level]; +   const uint texHeight = ct->base.height[level]; +   const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); +   const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + +   switch (ct->base.format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +   case PIPE_FORMAT_B8G8R8A8_UNORM: +   case PIPE_FORMAT_S8Z24_UNORM: +      { +         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 
6 : 1; +         int offset = surface->stride * texHeight * 4 * surface->face; +         uint *dst; + +         if (!ct->untiled_data[level]) { +            ct->untiled_data[level] = +               align_malloc(surface->stride * texHeight * 4 * numFaces, 16); +         } + +         dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + +         untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, +                              surface->stride, src); +      } +      break; +   default: +      { +         ct->untiled_data[level] = NULL; +         printf("Cell: untwiddle unsupported texture format %s\n", +                pf_name(ct->base.format)); +      } +   } + +   screen->surface_unmap(screen, surface); +} + + +static struct pipe_surface * +cell_get_tex_surface(struct pipe_screen *screen, +                     struct pipe_texture *pt, +                     unsigned face, unsigned level, unsigned zslice, +                     unsigned usage) +{ +   struct cell_texture *ct = cell_texture(pt); +   struct pipe_surface *ps; + +   ps = CALLOC_STRUCT(pipe_surface); +   if (ps) { +      ps->refcount = 1; +      pipe_texture_reference(&ps->texture, pt); +      ps->format = pt->format; +      ps->block = pt->block; +      ps->width = pt->width[level]; +      ps->height = pt->height[level]; +      ps->nblocksx = pt->nblocksx[level]; +      ps->nblocksy = pt->nblocksy[level]; +      ps->stride = ct->stride[level]; +      ps->offset = ct->level_offset[level]; +      ps->usage = usage; + +      /* XXX may need to override usage flags (see sp_texture.c) */ + +      pipe_texture_reference(&ps->texture, pt);  +      ps->face = face; +      ps->level = level; +      ps->zslice = zslice; + +      if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { +         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? 
face : zslice) * +            ps->nblocksy * +            ps->stride; +      } +      else { +         assert(face == 0); +         assert(zslice == 0); +      } + +      if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { +         /* convert from tiled to linear layout */ +         cell_untwiddle_texture(screen, ps); +      } +   } +   return ps; +} + + +static void  +cell_tex_surface_release(struct pipe_screen *screen,  +                         struct pipe_surface **s) +{ +   struct cell_texture *ct = cell_texture((*s)->texture); +   const uint level = (*s)->level; +   struct pipe_surface *surf = *s; + +   if ((surf->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) +   { +      align_free(ct->untiled_data[level]); +      ct->untiled_data[level] = NULL; +   } + +   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) && +       (surf->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) { +      /* convert from linear to tiled layout */ +      cell_twiddle_texture(screen, surf); +   } + +   /* XXX if done rendering to teximage, re-tile */ + +   if (--surf->refcount == 0) { +      pipe_texture_reference(&surf->texture, NULL); +      FREE(surf); +   } +   *s = NULL; +} + + +static void * +cell_surface_map(struct pipe_screen *screen, +                 struct pipe_surface *surface, +                 unsigned flags) +{ +   ubyte *map; +   struct cell_texture *ct = cell_texture(surface->texture); +   const uint level = surface->level; + +   assert(ct); + +#if 0 +   if (flags & ~surface->usage) { +      assert(0); +      return NULL; +   } +#endif + +   map = pipe_buffer_map( screen, ct->buffer, flags ); +   if (map == NULL) { +      return NULL; +   } +   else { +      if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && +          (ct->untiled_data[level])) { +         return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); +      } +      else { +         return (void *) (map + surface->offset); +      } +   } +} + + +static void +cell_surface_unmap(struct pipe_screen *screen, +                   struct pipe_surface *surface) +{ +   struct cell_texture *ct = cell_texture(surface->texture); + +   assert(ct); + +   pipe_buffer_unmap( screen, ct->buffer ); +} + + + +void +cell_init_screen_texture_funcs(struct pipe_screen *screen) +{ +   screen->texture_create = cell_texture_create; +   screen->texture_release = cell_texture_release; + +   screen->get_tex_surface = cell_get_tex_surface; +   screen->tex_surface_release = cell_tex_surface_release; + +   screen->surface_map = cell_surface_map; +   screen->surface_unmap = cell_surface_unmap; +} diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h new file mode 100644 index 0000000000..7018b0c9bf --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -0,0 +1,72 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef CELL_TEXTURE_H +#define CELL_TEXTURE_H + + +struct cell_context; +struct pipe_texture; + + +/** + * Subclass of pipe_texture + */ +struct cell_texture +{ +   struct pipe_texture base; + +   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; +   unsigned long stride[CELL_MAX_TEXTURE_LEVELS]; + +   /* The data is held here: +    */ +   struct pipe_buffer *buffer; +   unsigned long buffer_size; + +   /** Texture data in tiled layout is held here */ +   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; +   /** Mapped, tiled texture data */ +   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; +   void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; +}; + + +/** cast wrapper */ +static INLINE struct cell_texture * +cell_texture(struct pipe_texture *pt) +{ +   return (struct cell_texture *) pt; +} + + + +extern void +cell_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif /* CELL_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c new file mode 100644 index 0000000000..ab54e79689 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -0,0 +1,309 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * Vertex buffer code.  The draw module transforms vertices to window + * coords, etc. and emits the vertices into buffer supplied by this module. + * When a vertex buffer is full, or we flush, we'll send the vertex data + * to the SPUs. + * + * Authors + *  Brian Paul + */ + + +#include "cell_batch.h" +#include "cell_context.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_vbuf.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + + +/** Allow vertex data to be inlined after RENDER command */ +#define ALLOW_INLINE_VERTS 1 + + +/** + * Subclass of vbuf_render because we need a cell_context pointer in + * a few places. + */ +struct cell_vbuf_render +{ +   struct vbuf_render base; +   struct cell_context *cell; +   uint prim;            /**< PIPE_PRIM_x */ +   uint vertex_size;     /**< in bytes */ +   void *vertex_buffer;  /**< just for debug, really */ +   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +/** cast wrapper */ +static struct cell_vbuf_render * +cell_vbuf_render(struct vbuf_render *vbr) +{ +   return (struct cell_vbuf_render *) vbr; +} + + + +static const struct vertex_info * +cell_vbuf_get_vertex_info(struct vbuf_render *vbr) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   return &cvbr->cell->vertex_info; +} + + +static void * +cell_vbuf_allocate_vertices(struct vbuf_render *vbr, +                            ushort vertex_size, ushort nr_vertices) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/ + +   assert(cvbr->vertex_buf == ~0); +   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell); +   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf]; +   cvbr->vertex_size = vertex_size; +   return cvbr->vertex_buffer; +} + + +static void +cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,  +                           unsigned vertex_size, unsigned vertices_used) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   struct cell_context *cell = cvbr->cell; + +   /* +   printf("%s vertex_buf = %u  count = %u\n", +          __FUNCTION__, cvbr->vertex_buf, vertices_used); +   */ + +   /* Make sure texture buffers aren't released until we're done rendering +    * with them. 
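+    * (cell_add_fenced_textures() presumably records the textures referenced
+    * by this batch against the batch buffer's fence, so the PPU side keeps
+    * them alive until the SPUs signal that fence; see cell_fence.h.)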
+    */ +   cell_add_fenced_textures(cell); + +   /* Tell SPUs they can release the vert buf */ +   if (cvbr->vertex_buf != ~0U) { +      STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0); +      struct cell_command_release_verts *release +         = (struct cell_command_release_verts *) +         cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts)); +      release->opcode[0] = CELL_CMD_RELEASE_VERTS; +      release->vertex_buf = cvbr->vertex_buf; +   } + +   cvbr->vertex_buf = ~0; +   cell_flush_int(cell, 0x0); + +   assert(vertices == cvbr->vertex_buffer); +   cvbr->vertex_buffer = NULL; +} + + + +static boolean +cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   cvbr->prim = prim; +   /*printf("cell_set_prim %u\n", prim);*/ +   return TRUE; +} + + +static void +cell_vbuf_draw(struct vbuf_render *vbr, +	       const ushort *indices, +               uint nr_indices) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   struct cell_context *cell = cvbr->cell; +   float xmin, ymin, xmax, ymax; +   uint i; +   uint nr_vertices = 0, min_index = ~0; +   const void *vertices = cvbr->vertex_buffer; +   const uint vertex_size = cvbr->vertex_size; + +   for (i = 0; i < nr_indices; i++) { +      if (indices[i] > nr_vertices) +         nr_vertices = indices[i]; +      if (indices[i] < min_index) +         min_index = indices[i]; +   } +   nr_vertices++; + +#if 0 +   /*if (min_index > 0)*/ +      printf("%s min_index = %u\n", __FUNCTION__, min_index); +#endif + +#if 0 +   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n", +          nr_indices, nr_vertices); +   printf("  "); +   for (i = 0; i < nr_indices; i += 3) { +      printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]); +   } +   printf("\n"); +#elif 0 +   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n", +          nr_indices, nr_vertices, +          indices[0], indices[1], indices[2]); +   printf("ind space = %u, vert space = %u, space = %u\n", +          nr_indices * 2, +          nr_vertices * 4 * cell->vertex_info.size, +          cell_batch_free_space(cell)); +#endif + +   /* compute x/y bounding box */ +   xmin = ymin = 1e50; +   xmax = ymax = -1e50; +   for (i = min_index; i < nr_vertices; i++) { +      const float *v = (float *) ((ubyte *) vertices + i * vertex_size); +      if (v[0] < xmin) +         xmin = v[0]; +      if (v[0] > xmax) +         xmax = v[0]; +      if (v[1] < ymin) +         ymin = v[1]; +      if (v[1] > ymax) +         ymax = v[1]; +   } +#if 0 +   printf("PPU Bounds %g, %g .. 
%g, %g\n", xmin, ymin, xmax, ymax); +   fflush(stdout); +#endif + +   if (cvbr->prim != PIPE_PRIM_TRIANGLES) +      return; /* only render tris for now */ + +   /* build/insert batch RENDER command */ +   { +      const uint index_bytes = ROUNDUP16(nr_indices * 2); +      const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size); +      STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0); +      const uint batch_size = sizeof(struct cell_command_render) + index_bytes; + +      struct cell_command_render *render +         = (struct cell_command_render *) +         cell_batch_alloc16(cell, batch_size); + +      render->opcode[0] = CELL_CMD_RENDER; +      render->prim_type = cvbr->prim; + +      render->num_indexes = nr_indices; +      render->min_index = min_index; + +      /* append indices after render command */ +      memcpy(render + 1, indices, nr_indices * 2); + +      /* if there's room, append vertices after the indices, else leave +       * vertices in the original/separate buffer. +       */ +      render->vertex_size = 4 * cell->vertex_info.size; +      render->num_verts = nr_vertices; +      if (ALLOW_INLINE_VERTS && +          min_index == 0 && +          vertex_bytes + 16 <= cell_batch_free_space(cell)) { +         /* vertex data inlined, after indices, at 16-byte boundary */ +         void *dst = cell_batch_alloc16(cell, vertex_bytes); +         memcpy(dst, vertices, vertex_bytes); +         render->inline_verts = TRUE; +         render->vertex_buf = ~0; +      } +      else { +         /* vertex data in separate buffer */ +         render->inline_verts = FALSE; +         ASSERT(cvbr->vertex_buf >= 0); +         render->vertex_buf = cvbr->vertex_buf; +      } + +      render->xmin = xmin; +      render->ymin = ymin; +      render->xmax = xmax; +      render->ymax = ymax; +   } + +#if 0 +   /* helpful for debug */ +   cell_flush_int(cell, CELL_FLUSH_WAIT); +#endif +} + + +static void +cell_vbuf_destroy(struct vbuf_render *vbr) +{ +   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); +   cvbr->cell->vbuf_render = NULL; +   FREE(cvbr); +} + + +/** + * Initialize the post-transform vertex buffer information for the given + * context. + */ +void +cell_init_vbuf(struct cell_context *cell) +{ +   assert(cell->draw); + +   cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render); + +   /* The max number of indexes is what can fix into a batch buffer, +    * minus the render and release-verts commands. 
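+    * (Indexes are 16-bit ushorts, hence the division by sizeof(ushort);
+    * with the 4 KB batch buffers this works out to roughly two thousand
+    * indexes per buffer, the two command structs being small by comparison.)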
+    */ +   cell->vbuf_render->base.max_indices +      = (CELL_BUFFER_SIZE +         - sizeof(struct cell_command_render) +         - sizeof(struct cell_command_release_verts)) +      / sizeof(ushort); +   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE; + +   cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info; +   cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices; +   cell->vbuf_render->base.set_primitive = cell_vbuf_set_primitive; +   cell->vbuf_render->base.draw = cell_vbuf_draw; +   cell->vbuf_render->base.release_vertices = cell_vbuf_release_vertices; +   cell->vbuf_render->base.destroy = cell_vbuf_destroy; + +   cell->vbuf_render->cell = cell; +#if 1 +   cell->vbuf_render->vertex_buf = ~0; +#endif + +   cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base); +} diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.h b/src/gallium/drivers/cell/ppu/cell_vbuf.h new file mode 100644 index 0000000000..d265cbf770 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.h @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef CELL_VBUF_H +#define CELL_VBUF_H + + +struct cell_context; + +extern void +cell_init_vbuf(struct cell_context *cell); + + +#endif /* CELL_VBUF_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c new file mode 100644 index 0000000000..9cba537d9e --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -0,0 +1,346 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_format.h" + +#include "../auxiliary/draw/draw_context.h" +#include "../auxiliary/draw/draw_private.h" + +#include "cell_context.h" +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Emit a 4x4 matrix transpose operation + * + * \param p         Function that the transpose operation is to be appended to + * \param row0      Register containing row 0 of the source matrix + * \param row1      Register containing row 1 of the source matrix + * \param row2      Register containing row 2 of the source matrix + * \param row3      Register containing row 3 of the source matrix + * \param dest_ptr  Register containing the address of the destination matrix + * \param shuf_ptr  Register containing the address of the shuffled data + * \param count     Number of colums to actually be written to the destination + * + * \note + * This function assumes that the registers named by \c row0, \c row1, + * \c row2, and \c row3 are scratch and can be modified by the generated code. + * Furthermore, these registers will be released, via calls to + * \c release_register, by this function. + *  + * \note + * This function requires that four temporary are available on entry. + */ +static void +emit_matrix_transpose(struct spe_function *p, +		      unsigned row0, unsigned row1, unsigned row2, +		      unsigned row3, unsigned dest_ptr, +		      unsigned shuf_ptr, unsigned count) +{ +   int shuf_hi = spe_allocate_available_register(p); +   int shuf_lo = spe_allocate_available_register(p); +   int t1 = spe_allocate_available_register(p); +   int t2 = spe_allocate_available_register(p); +   int t3; +   int t4; +   int col0; +   int col1; +   int col2; +   int col3; + + +   spe_lqd(p, shuf_hi, shuf_ptr, 3*16); +   spe_lqd(p, shuf_lo, shuf_ptr, 4*16); +   spe_shufb(p, t1, row0, row2, shuf_hi); +   spe_shufb(p, t2, row0, row2, shuf_lo); + + +   /* row0 and row2 are now no longer needed.  Re-use those registers as +    * temporaries. +    */ +   t3 = row0; +   t4 = row2; + +   spe_shufb(p, t3, row1, row3, shuf_hi); +   spe_shufb(p, t4, row1, row3, shuf_lo); + + +   /* row1 and row3 are now no longer needed.  Re-use those registers as +    * temporaries. 
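+    * (t1..t4 now hold the shuffled combinations of the row pairs; one more
+    * round of the same shuf_hi/shuf_lo shuffles below yields the four
+    * output columns.)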
+    */ +   col0 = row1; +   col1 = row3; + +   spe_shufb(p, col0, t1, t3, shuf_hi); +   if (count > 1) { +      spe_shufb(p, col1, t1, t3, shuf_lo); +   } + +   /* t1 and t3 are now no longer needed.  Re-use those registers as +    * temporaries. +    */ +   col2 = t1; +   col3 = t3; + +   if (count > 2) { +      spe_shufb(p, col2, t2, t4, shuf_hi); +   } + +   if (count > 3) { +      spe_shufb(p, col3, t2, t4, shuf_lo); +   } + + +   /* Store the results.  Remember that the stqd instruction is encoded using +    * the qword offset (stand-alone assemblers to the byte-offset to +    * qword-offset conversion for you), so the byte-offset needs be divided by +    * 16. +    */ +   switch (count) { +   case 4: +      spe_stqd(p, col3, dest_ptr, 3 * 16); +   case 3: +      spe_stqd(p, col2, dest_ptr, 2 * 16); +   case 2: +      spe_stqd(p, col1, dest_ptr, 1 * 16); +   case 1: +      spe_stqd(p, col0, dest_ptr, 0 * 16); +   } + + +   /* Release all of the temporary registers used. +    */ +   spe_release_register(p, col0); +   spe_release_register(p, col1); +   spe_release_register(p, col2); +   spe_release_register(p, col3); +   spe_release_register(p, shuf_hi); +   spe_release_register(p, shuf_lo); +   spe_release_register(p, t2); +   spe_release_register(p, t4); +} + + +#if 0 +/* This appears to not be used currently */ +static void +emit_fetch(struct spe_function *p, +	   unsigned in_ptr, unsigned *offset, +	   unsigned out_ptr, unsigned shuf_ptr, +	   enum pipe_format format) +{ +   const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0) +       + (pf_size_z(format) != 0) + (pf_size_w(format) != 0); +   const unsigned type = pf_type(format); +   const unsigned bytes = pf_size_x(format); + +   int v0 = spe_allocate_available_register(p); +   int v1 = spe_allocate_available_register(p); +   int v2 = spe_allocate_available_register(p); +   int v3 = spe_allocate_available_register(p); +   int tmp = spe_allocate_available_register(p); +   int float_zero = -1; +   int float_one = -1; +   float scale_signed = 0.0; +   float scale_unsigned = 0.0; + +   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16); +   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16); +   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16); +   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16); +   offset[0] += 4; +    +   switch (bytes) { +   case 1: +      scale_signed = 1.0f / 127.0f; +      scale_unsigned = 1.0f / 255.0f; +      spe_lqd(p, tmp, shuf_ptr, 1 * 16); +      spe_shufb(p, v0, v0, v0, tmp); +      spe_shufb(p, v1, v1, v1, tmp); +      spe_shufb(p, v2, v2, v2, tmp); +      spe_shufb(p, v3, v3, v3, tmp); +      break; +   case 2: +      scale_signed = 1.0f / 32767.0f; +      scale_unsigned = 1.0f / 65535.0f; +      spe_lqd(p, tmp, shuf_ptr, 2 * 16); +      spe_shufb(p, v0, v0, v0, tmp); +      spe_shufb(p, v1, v1, v1, tmp); +      spe_shufb(p, v2, v2, v2, tmp); +      spe_shufb(p, v3, v3, v3, tmp); +      break; +   case 4: +      scale_signed = 1.0f / 2147483647.0f; +      scale_unsigned = 1.0f / 4294967295.0f; +      break; +   default: +      assert(0); +      break; +   } + +   switch (type) { +   case PIPE_FORMAT_TYPE_FLOAT: +      break; +   case PIPE_FORMAT_TYPE_UNORM: +      spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16); +      spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff); +      spe_cuflt(p, v0, v0, 0); +      spe_fm(p, v0, v0, tmp); +      break; +   case PIPE_FORMAT_TYPE_SNORM: +      spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16); +      spe_iohl(p, tmp, ((unsigned) scale_signed) & 
0x0ffff); +      spe_csflt(p, v0, v0, 0); +      spe_fm(p, v0, v0, tmp); +      break; +   case PIPE_FORMAT_TYPE_USCALED: +      spe_cuflt(p, v0, v0, 0); +      break; +   case PIPE_FORMAT_TYPE_SSCALED: +      spe_csflt(p, v0, v0, 0); +      break; +   } + + +   if (count < 4) { +      float_one = spe_allocate_available_register(p); +      spe_il(p, float_one, 1); +      spe_cuflt(p, float_one, float_one, 0); +       +      if (count < 3) { +	 float_zero = spe_allocate_available_register(p); +	 spe_il(p, float_zero, 0); +      } +   } + +   spe_release_register(p, tmp); + +   emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count); + +   switch (count) { +   case 1: +      spe_stqd(p, float_zero, out_ptr, 1 * 16); +   case 2: +      spe_stqd(p, float_zero, out_ptr, 2 * 16); +   case 3: +      spe_stqd(p, float_one, out_ptr, 3 * 16); +   } + +   if (float_zero != -1) { +      spe_release_register(p, float_zero); +   } + +   if (float_one != -1) { +      spe_release_register(p, float_one); +   } +} +#endif + + +void cell_update_vertex_fetch(struct draw_context *draw) +{ +#if 0 +   struct cell_context *const cell = +       (struct cell_context *) draw->driver_private; +   struct spe_function *p = &cell->attrib_fetch; +   unsigned function_index[PIPE_MAX_ATTRIBS]; +   unsigned unique_attr_formats; +   int out_ptr; +   int in_ptr; +   int shuf_ptr; +   unsigned i; +   unsigned j; + + +   /* Determine how many unique input attribute formats there are.  At the +    * same time, store the index of the lowest numbered attribute that has +    * the same format as any non-unique format. +    */ +   unique_attr_formats = 1; +   function_index[0] = 0; +   for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) { +      const enum pipe_format curr_fmt = draw->vertex_element[i].src_format; + +      for (j = 0; j < i; j++) { +	 if (curr_fmt == draw->vertex_element[j].src_format) { +	    break; +	 } +      } +       +      if (j == i) { +	 unique_attr_formats++; +      } + +      function_index[i] = j; +   } + + +   /* Each fetch function can be a maximum of 34 instructions (note: this is +    * actually a slight over-estimate). +    */ +   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats); + + +   /* Allocate registers for the function's input parameters. +    */ +   out_ptr = spe_allocate_register(p, 3); +   in_ptr = spe_allocate_register(p, 4); +   shuf_ptr = spe_allocate_register(p, 5); + + +   /* Generate code for the individual attribute fetch functions. +    */ +   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { +      unsigned offset; + +      if (function_index[i] == i) { +	 cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr  +						     - (void *) p->store); + +	 offset = 0; +	 emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr, +		    draw->vertex_element[i].src_format); +	 spe_bi(p, 0, 0, 0); + +	 /* Round up to the next 16-byte boundary. +	  */ +	 if ((((unsigned) p->store) & 0x0f) != 0) { +	    const unsigned align = ((unsigned) p->store) & 0x0f; +	    p->store = (uint32_t *) (((void *) p->store) + align); +	 } +      } else { +	 /* Use the same function entry-point as a previously seen attribute +	  * with the same format. 
+	  */ +	 cell->attrib_fetch_offsets[i] =  +	     cell->attrib_fetch_offsets[function_index[i]]; +      } +   } +#else +   assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c new file mode 100644 index 0000000000..403cf6d50f --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -0,0 +1,146 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file cell_vertex_shader.c + * Vertex shader interface routines for Cell. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_batch.h" + +#include "cell/common.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +/** + * Run the vertex shader on all vertices in the vertex queue. + * Called by the draw module when the vertx cache needs to be flushed. 
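+ * (The body below is currently compiled out with #if 0 and simply asserts.
+ * When enabled it emits the attribute-fetch code, per-array info, viewport
+ * and uniform state into a batch buffer, then sends CELL_CMD_VS_EXECUTE
+ * mailbox messages to SPU 0 in groups of SPU_VERTS_PER_BATCH vertices.)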
+ */ +void +cell_vertex_shader_queue_flush(struct draw_context *draw) +{ +#if 0 +   struct cell_context *const cell = +       (struct cell_context *) draw->driver_private; +   struct cell_command_vs *const vs = &cell_global.command[0].vs; +   uint64_t *batch; +   struct cell_array_info *array_info; +   unsigned i, j; +   struct cell_attribute_fetch_code *cf; + +   assert(draw->vs.queue_nr != 0); + +   /* XXX: do this on statechange:  +    */ +   draw_update_vertex_fetch(draw); +   cell_update_vertex_fetch(draw); + + +   batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf)); +   batch[0] = CELL_CMD_STATE_ATTRIB_FETCH; +   cf = (struct cell_attribute_fetch_code *) (&batch[1]); +   cf->base = (uint64_t) cell->attrib_fetch.store; +   cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr  +				   - (void *) cell->attrib_fetch.store)); + + +   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { +      const enum pipe_format format = draw->vertex_element[i].src_format; +      const unsigned count = ((pf_size_x(format) != 0) +			      + (pf_size_y(format) != 0) +			      + (pf_size_z(format) != 0) +			      + (pf_size_w(format) != 0)); +      const unsigned size = pf_size_x(format) * count; + +      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info)); + +      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO; + +      array_info = (struct cell_array_info *) &batch[1]; +      assert(draw->vertex_fetch.src_ptr[i] != NULL); +      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i]; +      array_info->attr = i; +      array_info->pitch = draw->vertex_fetch.pitch[i]; +      array_info->size = size; +      array_info->function_offset = cell->attrib_fetch_offsets[i]; +   } + +   batch = cell_batch_alloc(cell, sizeof(batch[0]) +                            + sizeof(struct pipe_viewport_state)); +   batch[0] = CELL_CMD_STATE_VIEWPORT; +   (void) memcpy(&batch[1], &draw->viewport, +                 sizeof(struct pipe_viewport_state)); + +   { +      uint64_t uniforms = (uintptr_t) draw->user.constants; + +      batch = cell_batch_alloc(cell, 2 *sizeof(batch[0])); +      batch[0] = CELL_CMD_STATE_UNIFORMS; +      batch[1] = uniforms; +   } + +   cell_batch_flush(cell); + +   vs->opcode = CELL_CMD_VS_EXECUTE; +   vs->nr_attrs = draw->vertex_fetch.nr_attrs; + +   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane)); +   vs->nr_planes = draw->nr_planes; + +   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) { +      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i); + +      for (j = 0; j < n; j++) { +         vs->elts[j] = draw->vs.queue[i + j].elt; +         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; +      } + +      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) { +         vs->elts[j] = vs->elts[0]; +         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; +      } + +      vs->num_elts = n; +      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE); + +      cell_flush_int(cell, CELL_FLUSH_WAIT); +   } + +   draw->vs.post_nr = draw->vs.queue_nr; +   draw->vs.queue_nr = 0; +#else +   assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.h b/src/gallium/drivers/cell/ppu/cell_winsys.h new file mode 100644 index 0000000000..ae2af5696b --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_winsys.h @@ -0,0 +1,50 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef CELL_WINSYS_H +#define CELL_WINSYS_H + +#include "pipe/p_compiler.h" + + +/** + * Very simple winsys at this time. + * Will probably eventually add SPU control info. + */ +struct cell_winsys +{ +   uint preferredFormat; +}; + + +extern struct cell_winsys * +cell_get_winsys(uint format); + + + +#endif diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 0000000000..2be9a2d324 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile new file mode 100644 index 0000000000..116453b79c --- /dev/null +++ b/src/gallium/drivers/cell/spu/Makefile @@ -0,0 +1,82 @@ +# Gallium3D Cell driver: SPU code + +# This makefile builds the g3d_spu.a file that's linked into the +# PPU code/library. + + +TOP = ../../../../.. 
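+# (five levels up: from src/gallium/drivers/cell/spu back to the top of the tree)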
+include $(TOP)/configs/current + + +PROG = g3d + +PROG_SPU = $(PROG)_spu +PROG_SPU_A = $(PROG)_spu.a +PROG_SPU_EMBED_O = $(PROG)_spu-embed.o + + +SOURCES = \ +	spu_command.c \ +	spu_dcache.c \ +	spu_funcs.c \ +	spu_main.c \ +	spu_per_fragment_op.c \ +	spu_render.c \ +	spu_texture.c \ +	spu_tile.c \ +	spu_tri.c + +OLD_SOURCES = \ +	spu_exec.c \ +	spu_util.c \ +	spu_vertex_fetch.c \ +	spu_vertex_shader.c + + +SPU_OBJECTS = $(SOURCES:.c=.o) \ + +SPU_ASM_OUT = $(SOURCES:.c=.s) \ + +INCLUDE_DIRS = \ +	-I$(TOP)/src/mesa \ +	-I$(TOP)/src/gallium/include \ +	-I$(TOP)/src/gallium/auxiliary \ +	-I$(TOP)/src/gallium/drivers + + +.c.o: +	$(SPU_CC) $(SPU_CFLAGS) -c $< + +.c.s: +	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $< + + +# The .a file will be linked into the main/PPU executable +default: $(PROG_SPU_A) + +$(PROG_SPU_A): $(PROG_SPU_EMBED_O) +	$(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O) + +$(PROG_SPU_EMBED_O): $(PROG_SPU) +	$(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O) + +$(PROG_SPU): $(SPU_OBJECTS) +	$(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS) + + + +asmfiles: $(SPU_ASM_OUT) + + +clean: +	rm -f *~ *.o *.a *.d *.s $(PROG_SPU) + + + +depend: $(SOURCES) +	rm -f depend +	touch depend +	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h new file mode 100644 index 0000000000..d7ce005524 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -0,0 +1,145 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + + +#ifndef SPU_COLORPACK_H +#define SPU_COLORPACK_H + + +#include <transpose_matrix4x4.h> +#include <spu_intrinsics.h> + + +static INLINE unsigned int +spu_pack_R8G8B8A8(vector float rgba) +{ +  vector unsigned int out = spu_convtu(rgba, 32); + +  out = spu_shuffle(out, out, ((vector unsigned char) { +                                  0, 4, 8, 12, 0, 0, 0, 0,  +                                  0, 0, 0, 0, 0, 0, 0, 0 }) ); + +  return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_A8R8G8B8(vector float rgba) +{ +  vector unsigned int out = spu_convtu(rgba, 32); +  out = spu_shuffle(out, out, ((vector unsigned char) { +                                  12, 0, 4, 8, 0, 0, 0, 0,  +                                  0, 0, 0, 0, 0, 0, 0, 0}) ); +  return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_B8G8R8A8(vector float rgba) +{ +  vector unsigned int out = spu_convtu(rgba, 32); +  out = spu_shuffle(out, out, ((vector unsigned char) { +                                  8, 4, 0, 12, 0, 0, 0, 0,  +                                  0, 0, 0, 0, 0, 0, 0, 0}) ); +  return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle) +{ +  vector unsigned int out = spu_convtu(rgba, 32); +  out = spu_shuffle(out, out, shuffle); +  return spu_extract(out, 0); +} + + +static INLINE vector float +spu_unpack_B8G8R8A8(uint color) +{ +   vector unsigned int color_u4 = spu_splats(color); +   color_u4 = spu_shuffle(color_u4, color_u4, +                          ((vector unsigned char) { +                             2, 2, 2, 2, +                             1, 1, 1, 1, +                             0, 0, 0, 0, +                             3, 3, 3, 3}) ); +   return spu_convtf(color_u4, 32); +} + + +static INLINE vector float +spu_unpack_A8R8G8B8(uint color) +{ +   vector unsigned int color_u4 = spu_splats(color); +   color_u4 = spu_shuffle(color_u4, color_u4, +                          ((vector unsigned char) { +                             1, 1, 1, 1, +                             2, 2, 2, 2, +                             3, 3, 3, 3, +                             0, 0, 0, 0}) ); +   return spu_convtf(color_u4, 32); +} + + +/** + * \param color_in - array of 32-bit packed ARGB colors + * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order + */ +static INLINE void +spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4], +                               vector float color_out[4]) +{ +   vector unsigned int c0; + +   c0 = spu_shuffle(color_in[0], color_in[0], +                    ((vector unsigned char) { +                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) ); +   color_out[0] = spu_convtf(c0, 32); + +   c0 = spu_shuffle(color_in[1], color_in[1], +                    ((vector unsigned char) { +                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) ); +   color_out[1] = spu_convtf(c0, 32); + +   c0 = spu_shuffle(color_in[2], color_in[2], +                    ((vector unsigned char) { +                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) ); +   color_out[2] = spu_convtf(c0, 32); + +   c0 = spu_shuffle(color_in[3], color_in[3], +                    ((vector unsigned char) { +                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) ); +   color_out[3] = spu_convtf(c0, 32); + +   _transpose_matrix4x4(color_out, 
color_out); +} + + + +#endif /* SPU_COLORPACK_H */ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c new file mode 100644 index 0000000000..5c0179d954 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -0,0 +1,815 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +/** + * SPU command processing code + */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_command.h" +#include "spu_main.h" +#include "spu_render.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + + +struct spu_vs_context draw; + + +/** + * Buffers containing dynamically generated SPU code: + */ +static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] +    ALIGN16_ATTRIB; + + + +static INLINE int +align(int value, int alignment) +{ +   return (value + alignment - 1) & ~(alignment - 1); +} + + + +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) +{ +   /* Evidently, using less than a 16-byte status doesn't work reliably */ +   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, +                                              CELL_BUFFER_STATUS_FREE, +                                              CELL_BUFFER_STATUS_FREE, +                                              CELL_BUFFER_STATUS_FREE}; +   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); +   uint *dst = spu.init.buffer_status + index; + +   ASSERT(buffer < CELL_NUM_BUFFERS); + +   mfc_put((void *) &status,    /* src in local memory */ +           (unsigned int) dst,  /* dst in main memory */ +           sizeof(status),      /* size */ +           TAG_MISC,            /* tag is unimportant */ +           0, /* tid */ +           0  /* rid */); +} + + +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. 
+ * There's a qword of status per SPU. + */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ +   static const vector unsigned int status = {CELL_FENCE_SIGNALLED, +                                              CELL_FENCE_SIGNALLED, +                                              CELL_FENCE_SIGNALLED, +                                              CELL_FENCE_SIGNALLED}; +   uint *dst = (uint *) fence_cmd->fence; +   dst += 4 * spu.init.id;  /* main store/memory address, not local store */ +   ASSERT_ALIGN16(dst); +   mfc_put((void *) &status,    /* src in local memory */ +           (unsigned int) dst,  /* dst in main memory */ +           sizeof(status),      /* size */ +           TAG_FENCE,           /* tag */ +           0, /* tid */ +           0  /* rid */); +} + + +static void +cmd_clear_surface(const struct cell_command_clear_surface *clear) +{ +   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + +   if (clear->surface == 0) { +      spu.fb.color_clear_value = clear->value; +      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { +         uint x = (spu.init.id << 4) | (spu.init.id << 12) | +            (spu.init.id << 20) | (spu.init.id << 28); +         spu.fb.color_clear_value ^= x; +      } +   } +   else { +      spu.fb.depth_clear_value = clear->value; +   } + +#define CLEAR_OPT 1 +#if CLEAR_OPT + +   /* Simply set all tiles' status to CLEAR. +    * When we actually begin rendering into a tile, we'll initialize it to +    * the clear value.  If any tiles go untouched during the frame, +    * really_clear_tiles() will set them to the clear value. +    */ +   if (clear->surface == 0) { +      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); +   } +   else { +      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); +   } + +#else + +   /* +    * This path clears the whole framebuffer to the clear color right now. +    */ + +   /* +   printf("SPU: %s num=%d w=%d h=%d\n", +          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); +   */ + +   /* init a single tile to the clear value */ +   if (clear->surface == 0) { +      clear_c_tile(&spu.ctile); +   } +   else { +      clear_z_tile(&spu.ztile); +   } + +   /* walk over my tiles, writing the 'clear' tile's data */ +   { +      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; +      uint i; +      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { +         uint tx = i % spu.fb.width_tiles; +         uint ty = i / spu.fb.width_tiles; +         if (clear->surface == 0) +            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); +         else +            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); +      } +   } + +   if (spu.init.debug_flags & CELL_DEBUG_SYNC) { +      wait_on_mask(1 << TAG_SURFACE_CLEAR); +   } + +#endif /* CLEAR_OPT */ + +   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); +} + + +static void +cmd_release_verts(const struct cell_command_release_verts *release) +{ +   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); +   ASSERT(release->vertex_buf != ~0U); +   release_buffer(release->vertex_buf); +} + + +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. 
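+ * (The handler below copies the depth/stencil/alpha and blend state for the
+ * fallback path, grows a plain-malloc'd local code buffer when needed --
+ * align_malloc() isn't available in SPU code -- copies the generated code
+ * into it, and points the front- and back-facing fragment_ops function
+ * pointers at their respective offsets within that code.)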
+ */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ +   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); + +   /* Copy state info (for fallback case only - this will eventually +    * go away when the fallback case goes away) +    */ +   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); +   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); +   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); + +   /* Make sure the SPU knows which buffers it's expected to read when +    * it's told to pull tiles. +    */ +   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); + +   /* If we're forcing the fallback code to be used (for debug purposes), +    * install that.  Otherwise install the incoming SPU code. +    */ +   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) { +      static unsigned int warned = 0; +      if (!warned) { +         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); +         warned = 1; +      } +      /* The following two lines aren't really necessary if you +       * know the debug flags won't change during a run, and if you +       * know that the function pointers are initialized correctly. +       * We set them here to allow a person to change the debug +       * flags during a run (from inside a debugger). +       */ +      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; +      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; +      return; +   } + +   /* Make sure the SPU code buffer is large enough to hold the incoming code. +    * Note that we *don't* use align_malloc() and align_free(), because +    * those utility functions are *not* available in SPU code. +    * */ +   if (spu.fragment_ops_code_size < fops->total_code_size) { +      if (spu.fragment_ops_code != NULL) { +         free(spu.fragment_ops_code); +      } +      spu.fragment_ops_code_size = fops->total_code_size; +      spu.fragment_ops_code = malloc(fops->total_code_size); +      if (spu.fragment_ops_code == NULL) { +         /* Whoops. */ +         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size); +         spu.fragment_ops_code = NULL; +         spu.fragment_ops_code_size = 0; +         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; +         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; +         return; +      } +   } + +   /* Copy the SPU code from the command buffer to the spu buffer */ +   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size); + +   /* Set the pointers for the front-facing and back-facing fragments +    * to the specified offsets within the code.  Note that if the +    * front-facing and back-facing code are the same, they'll have +    * the same offset. 
+    */ +   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index]; +   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index]; +} + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ +   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); +   /* Copy SPU code from batch buffer to spu buffer */ +   memcpy(spu.fragment_program_code, fp->code, +          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 +   /* Point function pointer at new code */ +   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + +static uint +cmd_state_fs_constants(const qword *buffer, uint pos) +{ +   const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0); +   const float *constants = (const float *) &buffer[pos+2]; +   uint i; + +   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + +   /* Expand each float to float[4] for SOA execution */ +   for (i = 0; i < num_const; i++) { +      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]); +      spu.constants[i] = spu_splats(constants[i]); +   } + +   /* return new buffer pos (in 16-byte words) */ +   return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16); +} + + +static void +cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) +{ +   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n", +             cmd->width, +             cmd->height, +             cmd->color_start, +             cmd->color_format, +             cmd->depth_format); + +   ASSERT_ALIGN16(cmd->color_start); +   ASSERT_ALIGN16(cmd->depth_start); + +   spu.fb.color_start = cmd->color_start; +   spu.fb.depth_start = cmd->depth_start; +   spu.fb.color_format = cmd->color_format; +   spu.fb.depth_format = cmd->depth_format; +   spu.fb.width = cmd->width; +   spu.fb.height = cmd->height; +   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; +   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; + +   switch (spu.fb.depth_format) { +   case PIPE_FORMAT_Z32_UNORM: +      spu.fb.zsize = 4; +      spu.fb.zscale = (float) 0xffffffffu; +      break; +   case PIPE_FORMAT_Z24S8_UNORM: +   case PIPE_FORMAT_S8Z24_UNORM: +   case PIPE_FORMAT_Z24X8_UNORM: +   case PIPE_FORMAT_X8Z24_UNORM: +      spu.fb.zsize = 4; +      spu.fb.zscale = (float) 0x00ffffffu; +      break; +   case PIPE_FORMAT_Z16_UNORM: +      spu.fb.zsize = 2; +      spu.fb.zscale = (float) 0xffffu; +      break; +   default: +      spu.fb.zsize = 0; +      break; +   } +} + + +/** + * Tex texture mask_s/t and scale_s/t fields depend on the texture size and + * sampler wrap modes. 
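+ * (For PIPE_TEX_WRAP_REPEAT the mask is set to size - 1 so wrapping can be
+ * done with a bitwise AND, which presumes power-of-two level sizes; other
+ * wrap modes get an all-ones mask.  With normalized coordinates the scale
+ * is the level's width/height, converting s/t to texel units; otherwise
+ * the scale is 1.0.)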
+ */ +static void +update_tex_masks(struct spu_texture *texture, +                 const struct pipe_sampler_state *sampler) +{ +   uint i; + +   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { +      int width = texture->level[i].width; +      int height = texture->level[i].height; + +      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) +         texture->level[i].mask_s = spu_splats(width - 1); +      else +         texture->level[i].mask_s = spu_splats(~0); + +      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT) +         texture->level[i].mask_t = spu_splats(height - 1); +      else +         texture->level[i].mask_t = spu_splats(~0); + +      if (sampler->normalized_coords) { +         texture->level[i].scale_s = spu_splats((float) width); +         texture->level[i].scale_t = spu_splats((float) height); +      } +      else { +         texture->level[i].scale_s = spu_splats(1.0f); +         texture->level[i].scale_t = spu_splats(1.0f); +      } +   } +} + + +static void +cmd_state_sampler(const struct cell_command_sampler *sampler) +{ +   uint unit = sampler->unit; + +   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); + +   spu.sampler[unit] = sampler->state; + +   switch (spu.sampler[unit].min_img_filter) { +   case PIPE_TEX_FILTER_LINEAR: +      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear; +      break; +   case PIPE_TEX_FILTER_ANISO: +      /* fall-through, for now */ +   case PIPE_TEX_FILTER_NEAREST: +      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest; +      break; +   default: +      ASSERT(0); +   } + +   switch (spu.sampler[sampler->unit].mag_img_filter) { +   case PIPE_TEX_FILTER_LINEAR: +      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear; +      break; +   case PIPE_TEX_FILTER_ANISO: +      /* fall-through, for now */ +   case PIPE_TEX_FILTER_NEAREST: +      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest; +      break; +   default: +      ASSERT(0); +   } + +   switch (spu.sampler[sampler->unit].min_mip_filter) { +   case PIPE_TEX_MIPFILTER_NEAREST: +   case PIPE_TEX_MIPFILTER_LINEAR: +      spu.sample_texture_2d[unit] = sample_texture_2d_lod; +      break; +   case PIPE_TEX_MIPFILTER_NONE: +      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit]; +      break; +   default: +      ASSERT(0); +   } + +   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_texture(const struct cell_command_texture *texture) +{ +   const uint unit = texture->unit; +   uint i; + +   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); + +   spu.texture[unit].max_level = 0; +   spu.texture[unit].target = texture->target; + +   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { +      uint width = texture->width[i]; +      uint height = texture->height[i]; +      uint depth = texture->depth[i]; + +      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i, +             texture->start[i], texture->width[i], texture->height[i]); + +      spu.texture[unit].level[i].start = texture->start[i]; +      spu.texture[unit].level[i].width = width; +      spu.texture[unit].level[i].height = height; +      spu.texture[unit].level[i].depth = depth; + +      spu.texture[unit].level[i].tiles_per_row = +         (width + TILE_SIZE - 1) / TILE_SIZE; + +      spu.texture[unit].level[i].bytes_per_image = +         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth; + +      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); +      spu.texture[unit].level[i].max_t = 
spu_splats((int) height - 1); + +      if (texture->start[i]) +         spu.texture[unit].max_level = i; +   } + +   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ +   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); +   ASSERT(vinfo->num_attribs >= 1); +   ASSERT(vinfo->num_attribs <= 8); +   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); +} + + +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ +   const unsigned attr = vs_info->attr; + +   ASSERT(attr < PIPE_MAX_ATTRIBS); +   draw.vertex_fetch.src_ptr[attr] = vs_info->base; +   draw.vertex_fetch.pitch[attr] = vs_info->pitch; +   draw.vertex_fetch.size[attr] = vs_info->size; +   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; +   draw.vertex_fetch.dirty = 1; +} + + +static void +cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) +{ +   mfc_get(attribute_fetch_code_buffer, +           (unsigned int) code->base,  /* src */ +           code->size, +           TAG_BATCH_BUFFER, +           0, /* tid */ +           0  /* rid */); +   wait_on_mask(1 << TAG_BATCH_BUFFER); + +   draw.vertex_fetch.code = attribute_fetch_code_buffer; +} + + +static void +cmd_finish(void) +{ +   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); +   really_clear_tiles(0); +   /* wait for all outstanding DMAs to finish */ +   mfc_write_tag_mask(~0); +   mfc_read_tag_status_all(); +   /* send mbox message to PPU */ +   spu_write_out_mbox(CELL_CMD_FINISH); +} + + +/** + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * + * The opcode param encodes the location of the buffer and its size. 
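+ * (Bits 0..7 hold CELL_CMD_BATCH, bits 8..15 the batch buffer index, and
+ * bits 16..31 the payload size in bytes.  The buffer is DMA'd into local
+ * store with mfc_get(), released back to the PPU, and then decoded one
+ * 16-byte qword at a time in the loop below.)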
+ */ +static void +cmd_batch(uint opcode) +{ +   const uint buf = (opcode >> 8) & 0xff; +   uint size = (opcode >> 16); +   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB; +   const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]); +   uint pos; + +   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", +             buf, size, spu.init.buffers[buf]); + +   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); + +   ASSERT_ALIGN16(spu.init.buffers[buf]); + +   size = ROUNDUP16(size); + +   ASSERT_ALIGN16(spu.init.buffers[buf]); + +   mfc_get(buffer,  /* dest */ +           (unsigned int) spu.init.buffers[buf],  /* src */ +           size, +           TAG_BATCH_BUFFER, +           0, /* tid */ +           0  /* rid */); +   wait_on_mask(1 << TAG_BATCH_BUFFER); + +   /* Tell PPU we're done copying the buffer to local store */ +   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); +   release_buffer(buf); + +   /* +    * Loop over commands in the batch buffer +    */ +   for (pos = 0; pos < usize; /* no incr */) { +      switch (si_to_uint(buffer[pos])) { +      /* +       * rendering commands +       */ +      case CELL_CMD_CLEAR_SURFACE: +         { +            struct cell_command_clear_surface *clr +               = (struct cell_command_clear_surface *) &buffer[pos]; +            cmd_clear_surface(clr); +            pos += sizeof(*clr) / 16; +         } +         break; +      case CELL_CMD_RENDER: +         { +            struct cell_command_render *render +               = (struct cell_command_render *) &buffer[pos]; +            uint pos_incr; +            cmd_render(render, &pos_incr); +            pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return +         } +         break; +      /* +       * state-update commands +       */ +      case CELL_CMD_STATE_FRAMEBUFFER: +         { +            struct cell_command_framebuffer *fb +               = (struct cell_command_framebuffer *) &buffer[pos]; +            cmd_state_framebuffer(fb); +            pos += sizeof(*fb) / 16; +         } +         break; +      case CELL_CMD_STATE_FRAGMENT_OPS: +         { +            struct cell_command_fragment_ops *fops +               = (struct cell_command_fragment_ops *) &buffer[pos]; +            cmd_state_fragment_ops(fops); +            /* This is a variant-sized command */ +            pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16; +         } +         break; +      case CELL_CMD_STATE_FRAGMENT_PROGRAM: +         { +            struct cell_command_fragment_program *fp +               = (struct cell_command_fragment_program *) &buffer[pos]; +            cmd_state_fragment_program(fp); +            pos += sizeof(*fp) / 16; +         } +         break; +      case CELL_CMD_STATE_FS_CONSTANTS: +         pos = cmd_state_fs_constants(buffer, pos); +         break; +      case CELL_CMD_STATE_RASTERIZER: +         { +            struct cell_command_rasterizer *rast = +               (struct cell_command_rasterizer *) &buffer[pos]; +            spu.rasterizer = rast->rasterizer; +            pos += sizeof(*rast) / 16; +         } +         break; +      case CELL_CMD_STATE_SAMPLER: +         { +            struct cell_command_sampler *sampler +               = (struct cell_command_sampler *) &buffer[pos]; +            cmd_state_sampler(sampler); +            pos += sizeof(*sampler) / 16; +         } +         break; +      case CELL_CMD_STATE_TEXTURE: +         { +            struct cell_command_texture *texture +               = (struct 
cell_command_texture *) &buffer[pos]; +            cmd_state_texture(texture); +            pos += sizeof(*texture) / 16; +         } +         break; +      case CELL_CMD_STATE_VERTEX_INFO: +         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); +         pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16; +         break; +      case CELL_CMD_STATE_VIEWPORT: +         (void) memcpy(& draw.viewport, &buffer[pos+1], +                       sizeof(struct pipe_viewport_state)); +         pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16; +         break; +      case CELL_CMD_STATE_UNIFORMS: +         draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0); +         pos += 2; +         break; +      case CELL_CMD_STATE_VS_ARRAY_INFO: +         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); +         pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16; +         break; +      case CELL_CMD_STATE_BIND_VS: +#if 0 +         spu_bind_vertex_shader(&draw, +                                (struct cell_shader_info *) &buffer[pos+1]); +#endif +         pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16; +         break; +      case CELL_CMD_STATE_ATTRIB_FETCH: +         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) +                                &buffer[pos+1]); +         pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16; +         break; +      /* +       * misc commands +       */ +      case CELL_CMD_FINISH: +         cmd_finish(); +         pos += 1; +         break; +      case CELL_CMD_FENCE: +         { +            struct cell_command_fence *fence_cmd = +               (struct cell_command_fence *) &buffer[pos]; +            cmd_fence(fence_cmd); +            pos += sizeof(*fence_cmd) / 16; +         } +         break; +      case CELL_CMD_RELEASE_VERTS: +         { +            struct cell_command_release_verts *release +               = (struct cell_command_release_verts *) &buffer[pos]; +            cmd_release_verts(release); +            pos += sizeof(*release) / 16; +         } +         break; +      case CELL_CMD_FLUSH_BUFFER_RANGE: { +	 struct cell_buffer_range *br = (struct cell_buffer_range *) +	     &buffer[pos+1]; + +	 spu_dcache_mark_dirty((unsigned) br->base, br->size); +         pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16; +	 break; +      } +      default: +         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos])); +         ASSERT(0); +         break; +      } +   } + +   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); +} + + +#define PERF 0 + + +/** + * Main loop for SPEs: Get a command, execute it, repeat. 
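+ *
+ * Each iteration blocks on the inbound mailbox, masks the low byte of the
+ * received word with CELL_CMD_OPCODE_MASK and dispatches on it:
+ * CELL_CMD_EXIT ends the loop, CELL_CMD_BATCH DMAs in and executes a
+ * batch buffer via cmd_batch(), and anything else is reported as an
+ * error.  When PERF is non-zero, the SPU decrementer is sampled around
+ * each command to print the mailbox-wait and batch-execution times.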
+ */ +void +command_loop(void) +{ +   int exitFlag = 0; +   uint t0, t1; + +   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); + +   while (!exitFlag) { +      unsigned opcode; + +      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + +      if (PERF) +         spu_write_decrementer(~0); + +      /* read/wait from mailbox */ +      opcode = (unsigned int) spu_read_in_mbox(); +      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); + +      if (PERF) +         t0 = spu_read_decrementer(); + +      switch (opcode & CELL_CMD_OPCODE_MASK) { +      case CELL_CMD_EXIT: +         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); +         exitFlag = 1; +         break; +      case CELL_CMD_VS_EXECUTE: +#if 0 +         spu_execute_vertex_shader(&draw, &cmd.vs); +#endif +         break; +      case CELL_CMD_BATCH: +         cmd_batch(opcode); +         break; +      default: +         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); +      } + +      if (PERF) { +         t1 = spu_read_decrementer(); +         printf("wait mbox time: %gms   batch time: %gms\n", +                (~0u - t0) * spu.init.inv_timebase, +                (t0 - t1) * spu.init.inv_timebase); +      } +   } + +   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); + +   if (spu.init.debug_flags & CELL_DEBUG_CACHE) +      spu_dcache_report(); +} + +/* Initialize this module; we manage the fragment ops buffer here. */ +void +spu_command_init(void) +{ +   /* Install default/fallback fragment processing function. +    * This will normally be overriden by a code-gen'd function +    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. +    */ +   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; +   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + +   /* Set up the basic empty buffer for code-gen'ed fragment ops */ +   spu.fragment_ops_code = NULL; +   spu.fragment_ops_code_size = 0; +} + +void +spu_command_close(void) +{ +   /* Deallocate the code-gen buffer for fragment ops, and reset the +    * fragment ops functions to their initial setting (just to leave +    * things in a good state). +    */ +   if (spu.fragment_ops_code != NULL) { +      free(spu.fragment_ops_code); +   } +   spu_command_init(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h new file mode 100644 index 0000000000..83dcdade28 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -0,0 +1,35 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +extern void +command_loop(void); + +extern void +spu_command_init(void); + +extern void +spu_command_close(void); diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c new file mode 100644 index 0000000000..a6d67634fd --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -0,0 +1,145 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_dcache.h" + +#define CACHELINE_LOG2SIZE    7 +#define LINE_SIZE             (1U << 7) +#define ALIGN_MASK            (~(LINE_SIZE - 1)) + +#define CACHE_NAME            data +#define CACHED_TYPE           qword +#define CACHE_TYPE            CACHE_TYPE_RO +#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0) +#define CACHE_LOG2NNWAY       2 +#define CACHE_LOG2NSETS       6 +#ifdef DEBUG +#define CACHE_STATS           1 +#endif +#include <cache-api.h> + +/* Yes folks, this is ugly. + */ +#undef CACHE_NWAY +#undef CACHE_NSETS +#define CACHE_NAME            data +#define CACHE_NWAY            4 +#define CACHE_NSETS           (1U << 6) + + +/** + * Fetch between arbitrary number of bytes from an unaligned address + * + * \param dst   Destination data buffer + * \param ea    Main memory effective address of source data + * \param size  Number of bytes to read + * + * \warning + * As is hinted by the type of the \c dst pointer, this function writes + * multiples of 16-bytes. + */ +void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size) +{ +   const int shift = ea & 0x0f; +   const unsigned read_size = ROUNDUP16(size + shift); +   const unsigned last_read = ROUNDUP16(ea + size); +   const qword *const last_write = dst + (ROUNDUP16(size) / 16); +   unsigned i; + + +   if (shift == 0) { +      /* Data is already aligned.  Fetch directly into the destination buffer. 
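+       * Each cache_rd() returns one aligned 16-byte qword from the
+       * software cache, so this loop writes ROUNDUP16(size) bytes to dst
+       * (hence the \warning above about writing multiples of 16 bytes).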
+       */ +      for (i = 0; i < size; i += 16) { +         *(dst++) = cache_rd(data, ea + i); +      } +   } else { +      qword hi; + + +      /* Please exercise extreme caution when modifying this code.  This code +       * must not read past the end of the page containing the source data, +       * and it must not write more than ((size + 15) / 16) qwords to the +       * destination buffer. +       */ +      ea &= ~0x0f; +      hi = cache_rd(data, ea); +      for (i = 16; i < read_size; i += 16) { +         qword lo = cache_rd(data, ea + i); + +         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), +                          (qword) spu_rlmaskqwbyte(lo, shift - 16)); +         hi = lo; +      } + +      if (dst != last_write) { +         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0)); +      } +   } +    +   ASSERT((ea + i) == last_read); +   ASSERT(dst == last_write); +} + + +/** + * Notify the cache that a range of main memory may have been modified + */ +void +spu_dcache_mark_dirty(unsigned ea, unsigned size) +{ +   unsigned i; +   const unsigned aligned_start = (ea & ALIGN_MASK); +   const unsigned aligned_end = (ea + size + (LINE_SIZE - 1))  +       & ALIGN_MASK; + + +   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { +      const unsigned entry = __cache_dir[i]; +      const unsigned addr = entry & ~0x0f; + +      __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end)) +          ? (entry & ~CACHELINE_VALID) : entry; +   } +} + + +/** + * Print cache utilization report + */ +void +spu_dcache_report(void) +{ +#ifdef CACHE_STATS +   if (spu.init.id == 0) { +      printf("SPU 0: Texture cache report:\n"); +      cache_pr_stats(data); +   } +#endif +} + + diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h new file mode 100644 index 0000000000..39a19eb31b --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.h @@ -0,0 +1,37 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef SPU_DCACHE_H +#define SPU_DCACHE_H + +extern void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size); + +extern void +spu_dcache_mark_dirty(unsigned ea, unsigned size); + +extern void +spu_dcache_report(void); + +#endif /* SPU_DCACHE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c new file mode 100644 index 0000000000..e27df2dfb3 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -0,0 +1,1933 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * TGSI interpretor/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. + * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers.  This is the ExecMask. + * See store_dest(). + * + * The ExecMask is computed from three other masks (CondMask, LoopMask and + * ContMask) which are controlled by the flow control instructions (namely: + * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). 
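+ *
+ * Worked example: if an IF condition is true only for pixels 0 and 2 of
+ * the quad, the IF handler clears bits 1 and 3 of CondMask, leaving 0x5.
+ * With LoopMask, ContMask and FuncMask still 0xf, UPDATE_EXEC_MASK()
+ * gives ExecMask = 0x5, so store_dest() writes only channels 0 and 2.
+ * ELSE then sets CondMask = ~CondMask & (the mask saved on CondStack),
+ * i.e. 0xa here, and ENDIF pops the saved mask back off the stack.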
+ * + * + * Authors: + *   Michal Krol + *   Brian Paul + */ + +#include <transpose_matrix4x4.h> +#include <simdmath/ceilf4.h> +#include <simdmath/cosf4.h> +#include <simdmath/divf4.h> +#include <simdmath/floorf4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> +#include <simdmath/sinf4.h> +#include <simdmath/sqrtf4.h> +#include <simdmath/truncf4.h> + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "spu_exec.h" +#include "spu_main.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + +#define TILE_TOP_LEFT     0 +#define TILE_TOP_RIGHT    1 +#define TILE_BOTTOM_LEFT  2 +#define TILE_BOTTOM_RIGHT 3 + +/* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ +#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I +#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C +#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I +#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C +#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I +#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C +#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I +#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C +#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I +#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C +#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I +#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C +#define TEMP_128_I         TGSI_EXEC_TEMP_128_I +#define TEMP_128_C         TGSI_EXEC_TEMP_128_C +#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I +#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C +#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I +#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C +#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I +#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C +#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I +#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_R0            TGSI_EXEC_TEMP_R0 + +#define FOR_EACH_CHANNEL(CHAN)\ +   for (CHAN = 0; CHAN < 4; CHAN++) + +#define IS_CHANNEL_ENABLED(INST, CHAN)\ +   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IS_CHANNEL_ENABLED2(INST, CHAN)\ +   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + +#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ +   FOR_EACH_CHANNEL( CHAN )\ +      if (IS_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ +   FOR_EACH_CHANNEL( CHAN )\ +      if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + +/** The execution mask depends on the conditional mask and the loop mask */ +#define UPDATE_EXEC_MASK(MACH) \ +      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + + +#define CHAN_X  0 +#define CHAN_Y  1 +#define CHAN_Z  2 +#define CHAN_W  3 + + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call spu_exec_machine_run() many times. + */ +void +spu_exec_machine_init(struct spu_exec_machine *mach, +                      uint numSamplers, +                      struct spu_sampler *samplers, +                      unsigned processor) +{ +   const qword zero = si_il(0); +   const qword not_zero = si_il(~0); + +   (void) numSamplers; +   mach->Samplers = samplers; +   mach->Processor = processor; +   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + +   /* Setup constants. 
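+    * The integer temps are set to the 0x00000000, 0xFFFFFFFF, 0x7FFFFFFF
+    * and 0x80000000 patterns named by their macros; the float temps hold
+    * 1.0, 2.0, 128.0 and -128.0, which the LIT, EX2 and RCP handlers
+    * below rely on.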
*/ +   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; +   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; +   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1); +   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31); + +   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f); +   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f); +   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f); +   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f); +} + + +static INLINE qword +micro_abs(qword src) +{ +   return si_rotmi(si_shli(src, 1), -1); +} + +static INLINE qword +micro_ceil(qword src) +{ +   return (qword) _ceilf4((vec_float4) src); +} + +static INLINE qword +micro_cos(qword src) +{ +   return (qword) _cosf4((vec_float4) src); +} + +static const qword br_shuf = { +   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, +   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, +   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, +   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, +   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +}; + +static const qword bl_shuf = { +   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, +   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, +   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, +   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, +   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +}; + +static const qword tl_shuf = { +   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, +   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, +   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, +   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, +   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +}; + +static qword +micro_ddx(qword src) +{ +   qword bottom_right = si_shufb(src, src, br_shuf); +   qword bottom_left = si_shufb(src, src, bl_shuf); + +   return si_fs(bottom_right, bottom_left); +} + +static qword +micro_ddy(qword src) +{ +   qword top_left = si_shufb(src, src, tl_shuf); +   qword bottom_left = si_shufb(src, src, bl_shuf); + +   return si_fs(top_left, bottom_left); +} + +static INLINE qword +micro_div(qword src0, qword src1) +{ +   return (qword) _divf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_flr(qword src) +{ +   return (qword) _floorf4((vec_float4) src); +} + +static qword +micro_frc(qword src) +{ +   return si_fs(src, (qword) _floorf4((vec_float4) src)); +} + +static INLINE qword +micro_ge(qword src0, qword src1) +{ +   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); +} + +static qword +micro_lg2(qword src) +{ +   return (qword) _log2f4((vec_float4) src); +} + +static INLINE qword +micro_lt(qword src0, qword src1) +{ +   const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); + +   return si_xori(tmp, 0xff); +} + +static INLINE qword +micro_max(qword src0, qword src1) +{ +   return si_selb(src1, src0, si_fcgt(src0, src1)); +} + +static INLINE qword +micro_min(qword src0, qword src1) +{ +   return si_selb(src0, src1, si_fcgt(src0, src1)); +} + +static qword +micro_neg(qword src) +{ +   return si_xor(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_set_sign(qword src) +{ +   
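+   /* force the IEEE sign bit on; used by fetch_source() for the
+    * TGSI_UTIL_SIGN_SET source modifier
+    */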
return si_or(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_pow(qword src0, qword src1) +{ +   return (qword) _powf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_rnd(qword src) +{ +   const qword half = (qword) spu_splats(0.5f); + +   /* May be able to use _roundf4.  There may be some difference, though. +    */ +   return (qword) _floorf4((vec_float4) si_fa(src, half)); +} + +static INLINE qword +micro_ishr(qword src0, qword src1) +{ +   return si_rotma(src0, si_sfi(src1, 0)); +} + +static qword +micro_trunc(qword src) +{ +   return (qword) _truncf4((vec_float4) src); +} + +static qword +micro_sin(qword src) +{ +   return (qword) _sinf4((vec_float4) src); +} + +static INLINE qword +micro_sqrt(qword src) +{ +   return (qword) _sqrtf4((vec_float4) src); +} + +static void +fetch_src_file_channel( +   const struct spu_exec_machine *mach, +   const uint file, +   const uint swizzle, +   const union spu_exec_channel *index, +   union spu_exec_channel *chan ) +{ +   switch( swizzle ) { +   case TGSI_EXTSWIZZLE_X: +   case TGSI_EXTSWIZZLE_Y: +   case TGSI_EXTSWIZZLE_Z: +   case TGSI_EXTSWIZZLE_W: +      switch( file ) { +      case TGSI_FILE_CONSTANT: { +         unsigned i; + +         for (i = 0; i < 4; i++) { +            const float *ptr = mach->Consts[index->i[i]]; +            float tmp[4]; + +            spu_dcache_fetch_unaligned((qword *) tmp, +                                       (uintptr_t)(ptr + swizzle), +                                       sizeof(float)); + +            chan->f[i] = tmp[0]; +         } +         break; +      } + +      case TGSI_FILE_INPUT: +         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; +         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; +         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; +         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; +         break; + +      case TGSI_FILE_TEMPORARY: +         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; +         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; +         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; +         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; +         break; + +      case TGSI_FILE_IMMEDIATE: +         ASSERT( index->i[0] < (int) mach->ImmLimit ); +         ASSERT( index->i[1] < (int) mach->ImmLimit ); +         ASSERT( index->i[2] < (int) mach->ImmLimit ); +         ASSERT( index->i[3] < (int) mach->ImmLimit ); + +         chan->f[0] = mach->Imms[index->i[0]][swizzle]; +         chan->f[1] = mach->Imms[index->i[1]][swizzle]; +         chan->f[2] = mach->Imms[index->i[2]][swizzle]; +         chan->f[3] = mach->Imms[index->i[3]][swizzle]; +         break; + +      case TGSI_FILE_ADDRESS: +         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; +         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; +         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; +         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; +         break; + +      case TGSI_FILE_OUTPUT: +         /* vertex/fragment output vars can be read too */ +         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; +         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; +         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; +         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; +         break; + +      default: +         ASSERT( 0 ); +      } +      break; + +   case 
TGSI_EXTSWIZZLE_ZERO: +      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; +      break; + +   case TGSI_EXTSWIZZLE_ONE: +      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; +      break; + +   default: +      ASSERT( 0 ); +   } +} + +static void +fetch_source( +   const struct spu_exec_machine *mach, +   union spu_exec_channel *chan, +   const struct tgsi_full_src_register *reg, +   const uint chan_index ) +{ +   union spu_exec_channel index; +   uint swizzle; + +   index.i[0] = +   index.i[1] = +   index.i[2] = +   index.i[3] = reg->SrcRegister.Index; + +   if (reg->SrcRegister.Indirect) { +      union spu_exec_channel index2; +      union spu_exec_channel indir_index; + +      index2.i[0] = +      index2.i[1] = +      index2.i[2] = +      index2.i[3] = reg->SrcRegisterInd.Index; + +      swizzle = tgsi_util_get_src_register_swizzle(®->SrcRegisterInd, +                                                   CHAN_X); +      fetch_src_file_channel( +         mach, +         reg->SrcRegisterInd.File, +         swizzle, +         &index2, +         &indir_index ); + +      index.q = si_a(index.q, indir_index.q); +   } + +   if( reg->SrcRegister.Dimension ) { +      switch( reg->SrcRegister.File ) { +      case TGSI_FILE_INPUT: +         index.q = si_mpyi(index.q, 17); +         break; +      case TGSI_FILE_CONSTANT: +         index.q = si_shli(index.q, 12); +         break; +      default: +         ASSERT( 0 ); +      } + +      index.i[0] += reg->SrcRegisterDim.Index; +      index.i[1] += reg->SrcRegisterDim.Index; +      index.i[2] += reg->SrcRegisterDim.Index; +      index.i[3] += reg->SrcRegisterDim.Index; + +      if (reg->SrcRegisterDim.Indirect) { +         union spu_exec_channel index2; +         union spu_exec_channel indir_index; + +         index2.i[0] = +         index2.i[1] = +         index2.i[2] = +         index2.i[3] = reg->SrcRegisterDimInd.Index; + +         swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); +         fetch_src_file_channel( +            mach, +            reg->SrcRegisterDimInd.File, +            swizzle, +            &index2, +            &indir_index ); + +         index.q = si_a(index.q, indir_index.q); +      } +   } + +   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); +   fetch_src_file_channel( +      mach, +      reg->SrcRegister.File, +      swizzle, +      &index, +      chan ); + +   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { +   case TGSI_UTIL_SIGN_CLEAR: +      chan->q = micro_abs(chan->q); +      break; + +   case TGSI_UTIL_SIGN_SET: +      chan->q = micro_set_sign(chan->q); +      break; + +   case TGSI_UTIL_SIGN_TOGGLE: +      chan->q = micro_neg(chan->q); +      break; + +   case TGSI_UTIL_SIGN_KEEP: +      break; +   } + +   if (reg->SrcRegisterExtMod.Complement) { +      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q); +   } +} + +static void +store_dest( +   struct spu_exec_machine *mach, +   const union spu_exec_channel *chan, +   const struct tgsi_full_dst_register *reg, +   const struct tgsi_full_instruction *inst, +   uint chan_index ) +{ +   union spu_exec_channel *dst; + +   switch( reg->DstRegister.File ) { +   case TGSI_FILE_NULL: +      return; + +   case TGSI_FILE_OUTPUT: +      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] +                           + reg->DstRegister.Index].xyzw[chan_index]; +      break; + +   case TGSI_FILE_TEMPORARY: +      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; 
+      break; + +   case TGSI_FILE_ADDRESS: +      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; +      break; + +   default: +      ASSERT( 0 ); +      return; +   } + +   switch (inst->Instruction.Saturate) +   { +   case TGSI_SAT_NONE: +      if (mach->ExecMask & 0x1) +         dst->i[0] = chan->i[0]; +      if (mach->ExecMask & 0x2) +         dst->i[1] = chan->i[1]; +      if (mach->ExecMask & 0x4) +         dst->i[2] = chan->i[2]; +      if (mach->ExecMask & 0x8) +         dst->i[3] = chan->i[3]; +      break; + +   case TGSI_SAT_ZERO_ONE: +      /* XXX need to obey ExecMask here */ +      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); +      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); +      break; + +   case TGSI_SAT_MINUS_PLUS_ONE: +      ASSERT( 0 ); +      break; + +   default: +      ASSERT( 0 ); +   } +} + +#define FETCH(VAL,INDEX,CHAN)\ +    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + +#define STORE(VAL,INDEX,CHAN)\ +    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + +/** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. + */ +static void +exec_kil(struct spu_exec_machine *mach, +         const struct tgsi_full_instruction *inst) +{ +   uint uniquemask; +   uint chan_index; +   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ +   union spu_exec_channel r[1]; + +   /* This mask stores component bits that were already tested. Note that +    * we test if the value is less than zero, so 1.0 and 0.0 need not to be +    * tested. */ +   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + +   for (chan_index = 0; chan_index < 4; chan_index++) +   { +      uint swizzle; +      uint i; + +      /* unswizzle channel */ +      swizzle = tgsi_util_get_full_src_register_extswizzle ( +                        &inst->FullSrcRegisters[0], +                        chan_index); + +      /* check if the component has not been already tested */ +      if (uniquemask & (1 << swizzle)) +         continue; +      uniquemask |= 1 << swizzle; + +      FETCH(&r[0], 0, chan_index); +      for (i = 0; i < 4; i++) +         if (r[0].f[i] < 0.0f) +            kilmask |= 1 << i; +   } + +   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/** + * Execute NVIDIA-style KIL which is predicated by a condition code. + * Kill fragment if the condition code is TRUE. + */ +static void +exec_kilp(struct tgsi_exec_machine *mach, +          const struct tgsi_full_instruction *inst) +{ +   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + +   /* TODO: build kilmask from CC mask */ + +   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/* + * Fetch a texel using STR texture coordinates. 
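+ *
+ * The sampler's get_samples() call returns four RGBA texels in AoS
+ * order; they are transposed to SoA below so that r/g/b/a each hold one
+ * color channel for all four fragments of the quad.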
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   qword rgba[4];
+   qword out[4];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias,
+                        (float (*)[4]) rgba);
+
+   _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
+}
+
+
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod, boolean projected)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      if (projected) {
+         FETCH(&r[1], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[1].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[1].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      ASSERT (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const
float y = mach->QuadPos.xyzw[1].f[0]; +   const float dadx = mach->InterpCoefs[attrib].dadx[chan]; +   const float dady = mach->InterpCoefs[attrib].dady[chan]; +   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; +   mach->Inputs[attrib].xyzw[chan].f[0] = a0; +   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; +   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; +   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; +} + +static void +perspective_interpolation( +   struct spu_exec_machine *mach, +   unsigned attrib, +   unsigned chan ) +{ +   const float x = mach->QuadPos.xyzw[0].f[0]; +   const float y = mach->QuadPos.xyzw[1].f[0]; +   const float dadx = mach->InterpCoefs[attrib].dadx[chan]; +   const float dady = mach->InterpCoefs[attrib].dady[chan]; +   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; +   const float *w = mach->QuadPos.xyzw[3].f; +   /* divide by W here */ +   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; +   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; +   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; +   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; +} + + +typedef void (* interpolation_func)( +   struct spu_exec_machine *mach, +   unsigned attrib, +   unsigned chan ); + +static void +exec_declaration(struct spu_exec_machine *mach, +                 const struct tgsi_full_declaration *decl) +{ +   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { +      if( decl->Declaration.File == TGSI_FILE_INPUT ) { +         unsigned first, last, mask; +         interpolation_func interp; + +         first = decl->DeclarationRange.First; +         last = decl->DeclarationRange.Last; +         mask = decl->Declaration.UsageMask; + +         switch( decl->Declaration.Interpolate ) { +         case TGSI_INTERPOLATE_CONSTANT: +            interp = constant_interpolation; +            break; + +         case TGSI_INTERPOLATE_LINEAR: +            interp = linear_interpolation; +            break; + +         case TGSI_INTERPOLATE_PERSPECTIVE: +            interp = perspective_interpolation; +            break; + +         default: +            ASSERT( 0 ); +         } + +         if( mask == TGSI_WRITEMASK_XYZW ) { +            unsigned i, j; + +            for( i = first; i <= last; i++ ) { +               for( j = 0; j < NUM_CHANNELS; j++ ) { +                  interp( mach, i, j ); +               } +            } +         } +         else { +            unsigned i, j; + +            for( j = 0; j < NUM_CHANNELS; j++ ) { +               if( mask & (1 << j) ) { +                  for( i = first; i <= last; i++ ) { +                     interp( mach, i, j ); +                  } +               } +            } +         } +      } +   } +} + +static void +exec_instruction( +   struct spu_exec_machine *mach, +   const struct tgsi_full_instruction *inst, +   int *pc ) +{ +   uint chan_index; +   union spu_exec_channel r[8]; + +   (*pc)++; + +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_ARL: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +	 FETCH( &r[0], 0, chan_index ); +         r[0].q = si_cflts(r[0].q, 0); +	 STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_SWZ: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_LIT: +      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { 
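+         /* LIT: dst.x = 1,  dst.y = max(src.x, 0),
+          *      dst.z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, -128, 128) : 0,
+          *      dst.w = 1  -- computed piecewise per enabled write-mask
+          *      channel below.
+          */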
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); +      } + +      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { +	 FETCH( &r[0], 0, CHAN_X ); +         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { +            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); +	    STORE( &r[0], 0, CHAN_Y ); +	 } + +         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { +            FETCH( &r[1], 0, CHAN_Y ); +            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + +            FETCH( &r[2], 0, CHAN_W ); +            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q); +            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q); +            r[1].q = micro_pow(r[1].q, r[2].q); + +            /* r0 = (r0 > 0.0) ? r1 : 0.0 +             */ +            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); +            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q, +                             r[0].q); +            STORE( &r[0], 0, CHAN_Z ); +         } +      } + +      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { +	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); +      } +      break; + +   case TGSI_OPCODE_RCP: +   /* TGSI_OPCODE_RECIP */ +      FETCH( &r[0], 0, CHAN_X ); +      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +	 STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_RSQ: +   /* TGSI_OPCODE_RECIPSQRT */ +      FETCH( &r[0], 0, CHAN_X ); +      r[0].q = micro_sqrt(r[0].q); +      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +	 STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_EXP: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_LOG: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_MUL: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) +      { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); + +         r[0].q = si_fm(r[0].q, r[1].q); + +         STORE(&r[0], 0, chan_index); +      } +      break; + +   case TGSI_OPCODE_ADD: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = si_fa(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_DP3: +   /* TGSI_OPCODE_DOT3 */ +      FETCH( &r[0], 0, CHAN_X ); +      FETCH( &r[1], 1, CHAN_X ); +      r[0].q = si_fm(r[0].q, r[1].q); + +      FETCH( &r[1], 0, CHAN_Y ); +      FETCH( &r[2], 1, CHAN_Y ); +      r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + +      FETCH( &r[1], 0, CHAN_Z ); +      FETCH( &r[2], 1, CHAN_Z ); +      r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         STORE( &r[0], 0, chan_index ); +      } +      break; + +    case TGSI_OPCODE_DP4: +    /* TGSI_OPCODE_DOT4 */ +       FETCH(&r[0], 0, CHAN_X); +       FETCH(&r[1], 1, CHAN_X); + +      r[0].q = si_fm(r[0].q, r[1].q); + +       FETCH(&r[1], 0, CHAN_Y); +       FETCH(&r[2], 1, CHAN_Y); + +      r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + +       FETCH(&r[1], 0, CHAN_Z); +       FETCH(&r[2], 1, CHAN_Z); + +      r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + +       FETCH(&r[1], 0, CHAN_W); +       FETCH(&r[2], 1, CHAN_W); + +      r[0].q = si_fma(r[1].q, r[2].q, 
r[0].q); + +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +	 STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_DST: +      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { +	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); +      } + +      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { +	 FETCH( &r[0], 0, CHAN_Y ); +	 FETCH( &r[1], 1, CHAN_Y); +      r[0].q = si_fm(r[0].q, r[1].q); +	 STORE( &r[0], 0, CHAN_Y ); +      } + +      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { +	 FETCH( &r[0], 0, CHAN_Z ); +	 STORE( &r[0], 0, CHAN_Z ); +      } + +      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { +	 FETCH( &r[0], 1, CHAN_W ); +	 STORE( &r[0], 0, CHAN_W ); +      } +      break; + +   case TGSI_OPCODE_MIN: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); + +         r[0].q = micro_min(r[0].q, r[1].q); + +         STORE(&r[0], 0, chan_index); +      } +      break; + +   case TGSI_OPCODE_MAX: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); + +         r[0].q = micro_max(r[0].q, r[1].q); + +         STORE(&r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SLT: +   /* TGSI_OPCODE_SETLT */ +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); + +         r[0].q = micro_ge(r[0].q, r[1].q); +         r[0].q = si_xori(r[0].q, 0xff); + +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SGE: +   /* TGSI_OPCODE_SETGE */ +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = micro_ge(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_MAD: +   /* TGSI_OPCODE_MADD */ +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         FETCH( &r[2], 2, chan_index ); +         r[0].q = si_fma(r[0].q, r[1].q, r[2].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SUB: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); + +         r[0].q = si_fs(r[0].q, r[1].q); + +         STORE(&r[0], 0, chan_index); +      } +      break; + +   case TGSI_OPCODE_LERP: +   /* TGSI_OPCODE_LRP */ +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); +         FETCH(&r[2], 2, chan_index); + +         r[1].q = si_fs(r[1].q, r[2].q); +         r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + +         STORE(&r[0], 0, chan_index); +      } +      break; + +   case TGSI_OPCODE_CND: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_CND0: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_DOT2ADD: +      /* TGSI_OPCODE_DP2A */ +      ASSERT (0); +      break; + +   case TGSI_OPCODE_INDEX: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_NEGATE: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_FRAC: +   /* TGSI_OPCODE_FRC */ +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = micro_frc(r[0].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_CLAMP: +      
ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_flr(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_rnd(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_lg2(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = micro_pow(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      /* r2 = (r0 * r1) - (r3 * r4)
+       */
+      r[2].q = si_fm(r[3].q, r[4].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+      FETCH(&r[5], 0, CHAN_X);
+
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       ASSERT (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          r[0].q = micro_abs(r[0].q);
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      r[0].q = si_fa(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_cos(r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+     
 break; + +   case TGSI_OPCODE_DDX: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = micro_ddx(r[0].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_DDY: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = micro_ddy(r[0].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_KILP: +      exec_kilp (mach, inst); +      break; + +   case TGSI_OPCODE_KIL: +      exec_kil (mach, inst); +      break; + +   case TGSI_OPCODE_PK2H: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_PK2US: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_PK4B: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_PK4UB: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_RFL: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_SEQ: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); + +         r[0].q = si_fceq(r[0].q, r[1].q); + +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SFL: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_SGT: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = si_fcgt(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SIN: +      FETCH( &r[0], 0, CHAN_X ); +      r[0].q = micro_sin(r[0].q); +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SLE: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); + +         r[0].q = si_fcgt(r[0].q, r[1].q); +         r[0].q = si_xori(r[0].q, 0xff); + +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SNE: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); + +         r[0].q = si_fceq(r[0].q, r[1].q); +         r[0].q = si_xori(r[0].q, 0xff); + +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_STR: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_TEX: +      /* simple texture lookup */ +      /* src[0] = texcoord */ +      /* src[1] = sampler unit */ +      exec_tex(mach, inst, FALSE, FALSE); +      break; + +   case TGSI_OPCODE_TXB: +      /* Texture lookup with lod bias */ +      /* src[0] = texcoord (src[0].w = load bias) */ +      /* src[1] = sampler unit */ +      exec_tex(mach, inst, TRUE, FALSE); +      break; + +   case TGSI_OPCODE_TXD: +      /* Texture lookup with explict partial derivatives */ +      /* src[0] = texcoord */ +      /* src[1] = d[strq]/dx */ +      /* src[2] = d[strq]/dy */ +      /* src[3] = sampler unit */ +      ASSERT (0); +      break; + +   case TGSI_OPCODE_TXL: +      /* Texture lookup with explit LOD */ +      /* src[0] = texcoord (src[0].w = load bias) */ +      /* src[1] = sampler unit */ +      exec_tex(mach, inst, TRUE, FALSE); +      break; + +   case TGSI_OPCODE_TXP: +      /* Texture lookup with projection */ +      /* src[0] = texcoord (src[0].w = projection) */ +      /* src[1] = sampler unit */ +      exec_tex(mach, inst, TRUE, TRUE); +      break; + +   case 
TGSI_OPCODE_UP2H: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_UP2US: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_UP4B: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_UP4UB: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_X2D: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_ARA: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_ARR: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_BRA: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_CAL: +      /* skip the call if no execution channels are enabled */ +      if (mach->ExecMask) { +         /* do the call */ + +         /* push the Cond, Loop, Cont stacks */ +         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); +         mach->CondStack[mach->CondStackTop++] = mach->CondMask; +         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); +         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; +         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); +         mach->ContStack[mach->ContStackTop++] = mach->ContMask; + +         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); +         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + +         /* note that PC was already incremented above */ +         mach->CallStack[mach->CallStackTop++] = *pc; +         *pc = inst->InstructionExtLabel.Label; +      } +      break; + +   case TGSI_OPCODE_RET: +      mach->FuncMask &= ~mach->ExecMask; +      UPDATE_EXEC_MASK(mach); + +      if (mach->ExecMask == 0x0) { +         /* really return now (otherwise, keep executing */ + +         if (mach->CallStackTop == 0) { +            /* returning from main() */ +            *pc = -1; +            return; +         } +         *pc = mach->CallStack[--mach->CallStackTop]; + +         /* pop the Cond, Loop, Cont stacks */ +         ASSERT(mach->CondStackTop > 0); +         mach->CondMask = mach->CondStack[--mach->CondStackTop]; +         ASSERT(mach->LoopStackTop > 0); +         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; +         ASSERT(mach->ContStackTop > 0); +         mach->ContMask = mach->ContStack[--mach->ContStackTop]; +         ASSERT(mach->FuncStackTop > 0); +         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + +         UPDATE_EXEC_MASK(mach); +      } +      break; + +   case TGSI_OPCODE_SSG: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_CMP: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH(&r[0], 0, chan_index); +         FETCH(&r[1], 1, chan_index); +         FETCH(&r[2], 2, chan_index); + +         /* r0 = (r0 < 0.0) ? 
r1 : r2 +          */ +         r[3].q = si_xor(r[3].q, r[3].q); +         r[0].q = micro_lt(r[0].q, r[3].q); +         r[0].q = si_selb(r[1].q, r[2].q, r[0].q); + +         STORE(&r[0], 0, chan_index); +      } +      break; + +   case TGSI_OPCODE_SCS: +      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { +         FETCH( &r[0], 0, CHAN_X ); +      } +      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { +         r[1].q = micro_cos(r[0].q); +         STORE( &r[1], 0, CHAN_X ); +      } +      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { +         r[1].q = micro_sin(r[0].q); +         STORE( &r[1], 0, CHAN_Y ); +      } +      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { +         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); +      } +      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { +         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); +      } +      break; + +   case TGSI_OPCODE_NRM: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_DIV: +      ASSERT( 0 ); +      break; + +   case TGSI_OPCODE_DP2: +      FETCH( &r[0], 0, CHAN_X ); +      FETCH( &r[1], 1, CHAN_X ); +      r[0].q = si_fm(r[0].q, r[1].q); + +      FETCH( &r[1], 0, CHAN_Y ); +      FETCH( &r[2], 1, CHAN_Y ); +      r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_IF: +      /* push CondMask */ +      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); +      mach->CondStack[mach->CondStackTop++] = mach->CondMask; +      FETCH( &r[0], 0, CHAN_X ); +      /* update CondMask */ +      if( ! r[0].u[0] ) { +         mach->CondMask &= ~0x1; +      } +      if( ! r[0].u[1] ) { +         mach->CondMask &= ~0x2; +      } +      if( ! r[0].u[2] ) { +         mach->CondMask &= ~0x4; +      } +      if( ! 
r[0].u[3] ) { +         mach->CondMask &= ~0x8; +      } +      UPDATE_EXEC_MASK(mach); +      /* Todo: If CondMask==0, jump to ELSE */ +      break; + +   case TGSI_OPCODE_ELSE: +      /* invert CondMask wrt previous mask */ +      { +         uint prevMask; +         ASSERT(mach->CondStackTop > 0); +         prevMask = mach->CondStack[mach->CondStackTop - 1]; +         mach->CondMask = ~mach->CondMask & prevMask; +         UPDATE_EXEC_MASK(mach); +         /* Todo: If CondMask==0, jump to ENDIF */ +      } +      break; + +   case TGSI_OPCODE_ENDIF: +      /* pop CondMask */ +      ASSERT(mach->CondStackTop > 0); +      mach->CondMask = mach->CondStack[--mach->CondStackTop]; +      UPDATE_EXEC_MASK(mach); +      break; + +   case TGSI_OPCODE_END: +      /* halt execution */ +      *pc = -1; +      break; + +   case TGSI_OPCODE_REP: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_ENDREP: +       ASSERT (0); +       break; + +   case TGSI_OPCODE_PUSHA: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_POPA: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_CEIL: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = micro_ceil(r[0].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_I2F: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = si_csflt(r[0].q, 0); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_NOT: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = si_xorbi(r[0].q, 0xff); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_TRUNC: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         r[0].q = micro_trunc(r[0].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SHL: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); + +         r[0].q = si_shl(r[0].q, r[1].q); + +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SHR: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = micro_ishr(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_AND: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = si_and(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_OR: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = si_or(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_MOD: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_XOR: +      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { +         FETCH( &r[0], 0, chan_index ); +         FETCH( &r[1], 1, chan_index ); +         r[0].q = si_xor(r[0].q, r[1].q); +         STORE( &r[0], 0, chan_index ); +      } +      break; + +   case TGSI_OPCODE_SAD: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_TXF: +      ASSERT (0); +      break; + +   case 
TGSI_OPCODE_TXQ: +      ASSERT (0); +      break; + +   case TGSI_OPCODE_EMIT: +      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; +      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; +      break; + +   case TGSI_OPCODE_ENDPRIM: +      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; +      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; +      break; + +   case TGSI_OPCODE_LOOP: +      /* fall-through (for now) */ +   case TGSI_OPCODE_BGNLOOP2: +      /* push LoopMask and ContMasks */ +      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); +      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; +      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); +      mach->ContStack[mach->ContStackTop++] = mach->ContMask; +      break; + +   case TGSI_OPCODE_ENDLOOP: +      /* fall-through (for now at least) */ +   case TGSI_OPCODE_ENDLOOP2: +      /* Restore ContMask, but don't pop */ +      ASSERT(mach->ContStackTop > 0); +      mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; +      if (mach->LoopMask) { +         /* repeat loop: jump to instruction just past BGNLOOP */ +         *pc = inst->InstructionExtLabel.Label + 1; +      } +      else { +         /* exit loop: pop LoopMask */ +         ASSERT(mach->LoopStackTop > 0); +         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; +         /* pop ContMask */ +         ASSERT(mach->ContStackTop > 0); +         mach->ContMask = mach->ContStack[--mach->ContStackTop]; +      } +      UPDATE_EXEC_MASK(mach); +      break; + +   case TGSI_OPCODE_BRK: +      /* turn off loop channels for each enabled exec channel */ +      mach->LoopMask &= ~mach->ExecMask; +      /* Todo: if mach->LoopMask == 0, jump to end of loop */ +      UPDATE_EXEC_MASK(mach); +      break; + +   case TGSI_OPCODE_CONT: +      /* turn off cont channels for each enabled exec channel */ +      mach->ContMask &= ~mach->ExecMask; +      /* Todo: if mach->LoopMask == 0, jump to end of loop */ +      UPDATE_EXEC_MASK(mach); +      break; + +   case TGSI_OPCODE_BGNSUB: +      /* no-op */ +      break; + +   case TGSI_OPCODE_ENDSUB: +      /* no-op */ +      break; + +   case TGSI_OPCODE_NOISE1: +      ASSERT( 0 ); +      break; + +   case TGSI_OPCODE_NOISE2: +      ASSERT( 0 ); +      break; + +   case TGSI_OPCODE_NOISE3: +      ASSERT( 0 ); +      break; + +   case TGSI_OPCODE_NOISE4: +      ASSERT( 0 ); +      break; + +   case TGSI_OPCODE_NOP: +      break; + +   default: +      ASSERT( 0 ); +   } +} + + +/** + * Run TGSI interpreter. 
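+ *
+ * Declarations and instructions are DMA'd in from main memory through the
+ * software data cache and executed until an END (or a final RET from main)
+ * sets the program counter to -1.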
+ * \return bitmask of "alive" quad components + */ +uint +spu_exec_machine_run( struct spu_exec_machine *mach ) +{ +   uint i; +   int pc = 0; + +   mach->CondMask = 0xf; +   mach->LoopMask = 0xf; +   mach->ContMask = 0xf; +   mach->FuncMask = 0xf; +   mach->ExecMask = 0xf; + +   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */ +   ASSERT(mach->CondStackTop == 0); +   ASSERT(mach->LoopStackTop == 0); +   ASSERT(mach->ContStackTop == 0); +   ASSERT(mach->CallStackTop == 0); + +   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; +   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + +   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { +      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; +      mach->Primitives[0] = 0; +   } + + +   /* execute declarations (interpolants) */ +   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { +      for (i = 0; i < mach->NumDeclarations; i++) { +         union { +            struct tgsi_full_declaration decl; +            qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16]; +         } d ALIGN16_ATTRIB; +         unsigned ea = (unsigned) (mach->Declarations + i); + +         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); + +         exec_declaration( mach, &d.decl ); +      } +   } + +   /* execute instructions, until pc is set to -1 */ +   while (pc != -1) { +      union { +         struct tgsi_full_instruction inst; +         qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16]; +      } i ALIGN16_ATTRIB; +      unsigned ea = (unsigned) (mach->Instructions + pc); + +      spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); +      exec_instruction( mach, & i.inst, &pc ); +   } + +#if 0 +   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ +   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { +      /* +       * Scale back depth component. +       */ +      for (i = 0; i < 4; i++) +         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; +   } +#endif + +   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; +} + + diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h new file mode 100644 index 0000000000..8605679940 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.h @@ -0,0 +1,172 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#if !defined SPU_EXEC_H +#define SPU_EXEC_H + +#include "pipe/p_compiler.h" +#include "tgsi/tgsi_exec.h" + +#if defined __cplusplus +extern "C" { +#endif + +/** +  * Registers may be treated as float, signed int or unsigned int. +  */ +union spu_exec_channel +{ +   float    f[QUAD_SIZE]; +   int      i[QUAD_SIZE]; +   unsigned u[QUAD_SIZE]; +   qword    q; +}; + +/** +  * A vector[RGBA] of channels[4 pixels] +  */ +struct spu_exec_vector +{ +   union spu_exec_channel xyzw[NUM_CHANNELS]; +}; + +/** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ +struct spu_interp_coef +{ +   float a0[NUM_CHANNELS];	/* in an xyzw layout */ +   float dadx[NUM_CHANNELS]; +   float dady[NUM_CHANNELS]; +}; + + +struct softpipe_tile_cache;  /**< Opaque to TGSI */ + +/** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ +struct spu_sampler +{ +   const struct pipe_sampler_state *state; +   struct pipe_texture *texture; +   /** Get samples for four fragments in a quad */ +   void (*get_samples)(struct spu_sampler *sampler, +                       const float s[QUAD_SIZE], +                       const float t[QUAD_SIZE], +                       const float p[QUAD_SIZE], +                       float lodbias, +                       float rgba[NUM_CHANNELS][QUAD_SIZE]); +   void *pipe; /*XXX temporary*/ +   struct softpipe_tile_cache *cache; +}; + + +/** + * Run-time virtual machine state for executing TGSI shader. + */ +struct spu_exec_machine +{ +   /* +    * 32 program temporaries +    * 4  internal temporaries +    * 1  address +    */ +   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS  +                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1] +       ALIGN16_ATTRIB; + +   struct spu_exec_vector       *Addrs; + +   struct spu_sampler           *Samplers; + +   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; +   unsigned                      ImmLimit; +   float                         (*Consts)[4]; +   struct spu_exec_vector       *Inputs; +   struct spu_exec_vector       *Outputs; +   unsigned                      Processor; + +   /* GEOMETRY processor only. */ +   unsigned                      *Primitives; + +   /* FRAGMENT processor only. 
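+    * InterpCoefs gives, per input channel, the plane-equation coefficients
+    * (a0, dadx, dady) used to evaluate fragment inputs at the quad's
+    * position (QuadPos).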
*/ +   const struct spu_interp_coef *InterpCoefs; +   struct spu_exec_vector       QuadPos; + +   /* Conditional execution masks */ +   uint CondMask;  /**< For IF/ELSE/ENDIF */ +   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */ +   uint ContMask;  /**< For loop CONT statements */ +   uint FuncMask;  /**< For function calls */ +   uint ExecMask;  /**< = CondMask & LoopMask */ + +   /** Condition mask stack (for nested conditionals) */ +   uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; +   int CondStackTop; + +   /** Loop mask stack (for nested loops) */ +   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; +   int LoopStackTop; + +   /** Loop continue mask stack (see comments in tgsi_exec.c) */ +   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; +   int ContStackTop; + +   /** Function execution mask stack (for executing subroutine code) */ +   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; +   int FuncStackTop; + +   /** Function call stack for saving/restoring the program counter */ +   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; +   int CallStackTop; + +   struct tgsi_full_instruction *Instructions; +   uint NumInstructions; + +   struct tgsi_full_declaration *Declarations; +   uint NumDeclarations; +}; + + +extern void +spu_exec_machine_init(struct spu_exec_machine *mach, +                      uint numSamplers, +                      struct spu_sampler *samplers, +                      unsigned processor); + +extern uint +spu_exec_machine_run( struct spu_exec_machine *mach ); + + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* SPU_EXEC_H */ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 0000000000..ff3d609d25 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,173 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. 
+ * + * Authors: Brian Paul + */ + + +#include <string.h> +#include <libmisc.h> +#include <math.h> +#include <cos14_v.h> +#include <sin14_v.h> +#include <simdmath/exp2f4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" +#include "spu_texture.h" + + +/** For "return"-ing four vectors */ +struct vec_4x4 +{ +   vector float v[4]; +}; + + +static vector float +spu_cos(vector float x) +{ +   return _cos14_v(x); +} + +static vector float +spu_sin(vector float x) +{ +   return _sin14_v(x); +} + +static vector float +spu_pow(vector float x, vector float y) +{ +   return _powf4(x, y); +} + +static vector float +spu_exp2(vector float x) +{ +   return _exp2f4(x); +} + +static vector float +spu_log2(vector float x) +{ +   return _log2f4(x); +} + + +static struct vec_4x4 +spu_tex_2d(vector float s, vector float t, vector float r, vector float q, +           unsigned unit) +{ +   struct vec_4x4 colors; +   (void) r; +   (void) q; +   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); +   return colors; +} + +static struct vec_4x4 +spu_tex_3d(vector float s, vector float t, vector float r, vector float q, +           unsigned unit) +{ +   struct vec_4x4 colors; +   (void) r; +   (void) q; +   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); +   return colors; +} + +static struct vec_4x4 +spu_tex_cube(vector float s, vector float t, vector float r, vector float q, +           unsigned unit) +{ +   struct vec_4x4 colors; +   (void) q; +   sample_texture_cube(s, t, r, unit, colors.v); +   return colors; +} + + +/** + * Add named function to list of "exported" functions that will be + * made available to the PPU-hosted code generator. + */ +static void +export_func(struct cell_spu_function_info *spu_functions, +            const char *name, void *addr) +{ +   uint n = spu_functions->num; +   ASSERT(strlen(name) < 16); +   strcpy(spu_functions->names[n], name); +   spu_functions->addrs[n] = (uint) addr; +   spu_functions->num++; +   ASSERT(spu_functions->num <= 16); +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. 
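+ *
+ * Illustrative sketch (an assumption, not part of this change) of how the
+ * PPU-side code generator might resolve one of these names from the
+ * cell_spu_function_info table that export_func() fills in below; the
+ * returned value is the function's address in SPU code space:
+ *
+ *    static uint
+ *    lookup_spu_func(const struct cell_spu_function_info *f, const char *name)
+ *    {
+ *       uint i;
+ *       for (i = 0; i < f->num; i++)
+ *          if (strcmp(f->names[i], name) == 0)
+ *             return f->addrs[i];
+ *       return 0;
+ *    }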
+ */ +void +return_function_info(void) +{ +   struct cell_spu_function_info funcs ALIGN16_ATTRIB; +   int tag = TAG_MISC; + +   ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + +   funcs.num = 0; +   export_func(&funcs, "spu_cos", &spu_cos); +   export_func(&funcs, "spu_sin", &spu_sin); +   export_func(&funcs, "spu_pow", &spu_pow); +   export_func(&funcs, "spu_exp2", &spu_exp2); +   export_func(&funcs, "spu_log2", &spu_log2); +   export_func(&funcs, "spu_tex_2d", &spu_tex_2d); +   export_func(&funcs, "spu_tex_3d", &spu_tex_3d); +   export_func(&funcs, "spu_tex_cube", &spu_tex_cube); + +   /* Send the function info back to the PPU / main memory */ +   mfc_put((void *) &funcs,  /* src in local store */ +           (unsigned int) spu.init.spu_functions, /* dst in main memory */ +           sizeof(funcs),  /* bytes */ +           tag, +           0, /* tid */ +           0  /* rid */); +   wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h new file mode 100644 index 0000000000..3adb6ae99f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -0,0 +1,35 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H + +extern void +return_function_info(void); + +#endif + diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c new file mode 100644 index 0000000000..97c86d194d --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -0,0 +1,117 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +/* main() for Cell SPU code */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_funcs.h" +#include "spu_command.h" +#include "spu_main.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +//#include "spu_test.h" +#include "cell/common.h" + + +/* +helpful headers: +/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h +/opt/cell/sdk/usr/include/libmisc.h +*/ + +struct spu_global spu; + + +static void +one_time_init(void) +{ +   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); +   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); +   invalidate_tex_cache(); +} + +/* In some versions of the SDK the SPE main takes 'unsigned long' as a + * parameter.  In others it takes 'unsigned long long'.  Use a define to + * select between the two. + */ +#ifdef SPU_MAIN_PARAM_LONG_LONG +typedef unsigned long long main_param_t; +#else +typedef unsigned long main_param_t; +#endif + +/** + * SPE entrypoint. 
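+ *
+ * \param speid  SPE id assigned by the runtime (only used in debug output)
+ * \param argp   main-memory address of the cell_init_info block that is
+ *               DMA'd into spu.init below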
+ */ +int +main(main_param_t speid, main_param_t argp) +{ +   int tag = 0; + +   (void) speid; + +   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); +   ASSERT(sizeof(struct cell_command_render) % 8 == 0); +   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0); +   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); + +   one_time_init(); +   spu_command_init(); + +   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); +   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); + +   /* get initialization data */ +   mfc_get(&spu.init,  /* dest */ +           (unsigned int) argp, /* src */ +           sizeof(struct cell_init_info), /* bytes */ +           tag, +           0, /* tid */ +           0  /* rid */); +   wait_on_mask( 1 << tag ); + +   if (spu.init.id == 0) { +      return_function_info(); +   } + +#if 0 +   if (spu.init.id==0) +      spu_test_misc(spu.init.id); +#endif + +   command_loop(); + +   spu_command_close(); + +   return 0; +} diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h new file mode 100644 index 0000000000..33767e7c51 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -0,0 +1,254 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SPU_MAIN_H +#define SPU_MAIN_H + + +#include <spu_mfcio.h> + +#include "cell/common.h" +#include "draw/draw_vertex.h" +#include "pipe/p_state.h" + + +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) \ +   if (spu.init.debug_flags & (flag)) \ +      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif + + +/** + * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. + * The data may be addressed through several different types. 
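+ * The members overlay the same storage, so scalar and SIMD code can share a
+ * tile.  Illustrative only, for some tile_t t:
+ *    t.ui[y][x] = 0xff00ff00;                  one 4-byte pixel
+ *    t.ui4[i][j] = spu_splats(0xff00ff00u);    four pixels with one SIMD store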
+ */ +typedef union { +   ushort us[TILE_SIZE][TILE_SIZE]; +   uint   ui[TILE_SIZE][TILE_SIZE]; +   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; +   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2]; +} tile_t; + + +#define TILE_STATUS_CLEAR   1 +#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */ +#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */ +#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */ +#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */ + + +/** Function for sampling textures */ +typedef void (*spu_sample_texture_2d_func)(vector float s, +                                           vector float t, +                                           uint unit, uint level, uint face, +                                           vector float colors[4]); + + +/** Function for performing per-fragment ops */ +typedef void (*spu_fragment_ops_func)(uint x, uint y, +                                      tile_t *colorTile, +                                      tile_t *depthStencilTile, +                                      vector float fragZ, +                                      vector float fragRed, +                                      vector float fragGreen, +                                      vector float fragBlue, +                                      vector float fragAlpha, +                                      vector unsigned int mask); + +/** Function for running fragment program */ +typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, +                                                         vector float *outputs, +                                                         vector float *constants); + + +struct spu_framebuffer +{ +   void *color_start;              /**< addr of color surface in main memory */ +   void *depth_start;              /**< addr of depth surface in main memory */ +   enum pipe_format color_format; +   enum pipe_format depth_format; +   uint width, height;             /**< size in pixels */ +   uint width_tiles, height_tiles; /**< width and height in tiles */ + +   uint color_clear_value; +   uint depth_clear_value; + +   uint zsize;                     /**< 0, 2 or 4 bytes per Z */ +   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */ +} ALIGN16_ATTRIB; + + +/** per-texture level info */ +struct spu_texture_level +{ +   void *start; +   ushort width, height, depth; +   ushort tiles_per_row; +   uint bytes_per_image; +   /** texcoord scale factors */ +   vector float scale_s, scale_t, scale_r; +   /** texcoord masks (if REPEAT then size-1, else ~0) */ +   vector signed int mask_s, mask_t, mask_r; +   /** texcoord clamp limits */ +   vector signed int max_s, max_t, max_r; +} ALIGN16_ATTRIB; + + +struct spu_texture +{ +   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; +   uint max_level; +   uint target;  /**< PIPE_TEXTURE_x */ +} ALIGN16_ATTRIB; + + +/** + * All SPU global/context state will be in a singleton object of this type: + */ +struct spu_global +{ +   /** One-time init/constant info */ +   struct cell_init_info init; + +   /* +    * Current state +    */ +   struct spu_framebuffer fb; +   struct pipe_depth_stencil_alpha_state depth_stencil_alpha; +   struct pipe_blend_state blend; +   struct pipe_blend_color blend_color; +   struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; +   struct pipe_rasterizer_state rasterizer; +   struct spu_texture texture[PIPE_MAX_SAMPLERS]; +   struct 
vertex_info vertex_info; + +   /** Current color and Z tiles */ +   tile_t ctile ALIGN16_ATTRIB; +   tile_t ztile ALIGN16_ATTRIB; + +   /** Read depth/stencil tiles? */ +   boolean read_depth_stencil; + +   /** Current tiles' status */ +   ubyte cur_ctile_status, cur_ztile_status; + +   /** Status of all tiles in framebuffer */ +   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; +   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + +   /** Current fragment ops machine code, at 8-byte boundary */ +   uint *fragment_ops_code; +   uint fragment_ops_code_size; +   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ +   spu_fragment_ops_func fragment_ops[2]; + +   /** Current fragment program machine code, at 8-byte boundary */ +   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; +   /** Current fragment program function */ +   spu_fragment_program_func fragment_program; + +   /** Current texture sampler function */ +   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS]; +   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS]; +   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS]; + +   /** Fragment program constants */ +   vector float constants[4 * CELL_MAX_CONSTANTS]; + +} ALIGN16_ATTRIB; + + +extern struct spu_global spu; + + + +/* DMA TAGS */ + +#define TAG_SURFACE_CLEAR     10 +#define TAG_VERTEX_BUFFER     11 +#define TAG_READ_TILE_COLOR   12 +#define TAG_READ_TILE_Z       13 +#define TAG_WRITE_TILE_COLOR  14 +#define TAG_WRITE_TILE_Z      15 +#define TAG_INDEX_BUFFER      16 +#define TAG_BATCH_BUFFER      17 +#define TAG_MISC              18 +#define TAG_DCACHE0           20 +#define TAG_DCACHE1           21 +#define TAG_DCACHE2           22 +#define TAG_DCACHE3           23 +#define TAG_FENCE             24 + + +static INLINE void +wait_on_mask(unsigned tagMask) +{ +   mfc_write_tag_mask( tagMask ); +   /* wait for completion of _any_ DMAs specified by tagMask */ +   mfc_read_tag_status_any(); +} + + +static INLINE void +wait_on_mask_all(unsigned tagMask) +{ +   mfc_write_tag_mask( tagMask ); +   /* wait for completion of _all_ DMAs specified by tagMask */ +   mfc_read_tag_status_all(); +} + + + + + +static INLINE void +memset16(ushort *d, ushort value, uint count) +{ +   uint i; +   for (i = 0; i < count; i++) +      d[i] = value; +} + + +static INLINE void +memset32(uint *d, uint value, uint count) +{ +   uint i; +   for (i = 0; i < count; i++) +      d[i] = value; +} + + +#endif /* SPU_MAIN_H */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c new file mode 100644 index 0000000000..eba9f95cf1 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -0,0 +1,631 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \author Brian Paul + */ + + +#include <transpose_matrix4x4.h> +#include "pipe/p_format.h" +#include "spu_main.h" +#include "spu_colorpack.h" +#include "spu_per_fragment_op.h" + + +#define LINEAR_QUAD_LAYOUT 1 + + +static INLINE vector float +spu_min(vector float a, vector float b) +{ +   vector unsigned int m; +   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */ +   return spu_sel(a, b, m); +} + + +static INLINE vector float +spu_max(vector float a, vector float b) +{ +   vector unsigned int m; +   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */ +   return spu_sel(b, a, m); +} + + +/** + * Called by rasterizer for each quad after the shader has run.  Do + * all the per-fragment operations including alpha test, z test, + * stencil test, blend, colormask and logicops.  This is a + * fallback/debug function.  In reality we'll use a generated function + * produced by the PPU.  But this function is useful for + * debug/validation. 
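+ *
+ * The operations below run in order: alpha test, Z/stencil test, blending,
+ * color packing, colormask, logic ops (still a stub) and finally the
+ * masked write of the quad back into the color tile.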
+ */ +void +spu_fallback_fragment_ops(uint x, uint y, +                          tile_t *colorTile, +                          tile_t *depthStencilTile, +                          vector float fragZ, +                          vector float fragR, +                          vector float fragG, +                          vector float fragB, +                          vector float fragA, +                          vector unsigned int mask) +{ +   vector float frag_aos[4]; +   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ +   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */ + +   /* +    * Do alpha test +    */ +   if (spu.depth_stencil_alpha.alpha.enabled) { +      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value); +      vector unsigned int amask; + +      switch (spu.depth_stencil_alpha.alpha.func) { +      case PIPE_FUNC_LESS: +         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */ +         break; +      case PIPE_FUNC_GREATER: +         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */ +         break; +      case PIPE_FUNC_GEQUAL: +         amask = spu_cmpgt(ref, fragA); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_LEQUAL: +         amask = spu_cmpgt(fragA, ref); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_EQUAL: +         amask = spu_cmpeq(ref, fragA); +         break; +      case PIPE_FUNC_NOTEQUAL: +         amask = spu_cmpeq(ref, fragA); +         amask = spu_nor(amask, amask); +         break; +      case PIPE_FUNC_ALWAYS: +         amask = spu_splats(0xffffffffU); +         break; +      case PIPE_FUNC_NEVER: +         amask = spu_splats( 0x0U); +         break; +      default: +         ; +      } + +      mask = spu_and(mask, amask); +   } + + +   /* +    * Z and/or stencil testing... 
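+    * For the S8Z24 formats handled here the stencil value sits in the top
+    * 8 bits and Z in the low 24 bits; mask24 below splits the combined
+    * value into ifbZ and ifbS accordingly.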
+    */ +   if (spu.depth_stencil_alpha.depth.enabled || +       spu.depth_stencil_alpha.stencil[0].enabled) { + +      /* get four Z/Stencil values from tile */ +      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); +      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; +      vector unsigned int ifbZ = spu_and(ifbZS, mask24); +      vector unsigned int ifbS = spu_andc(ifbZS, mask24); + +      if (spu.depth_stencil_alpha.stencil[0].enabled) { +         /* do stencil test */ +         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); + +      } +      else if (spu.depth_stencil_alpha.depth.enabled) { +         /* do depth test */ + +         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || +                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); + +         vector unsigned int ifragZ; +         vector unsigned int zmask; + +         /* convert four fragZ from float to uint */ +         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); +         ifragZ = spu_convtu(fragZ, 0); + +         /* do depth comparison, setting zmask with results */ +         switch (spu.depth_stencil_alpha.depth.func) { +         case PIPE_FUNC_LESS: +            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */ +            break; +         case PIPE_FUNC_GREATER: +            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifragZ > ifbZ) */ +            break; +         case PIPE_FUNC_GEQUAL: +            zmask = spu_cmpgt(ifbZ, ifragZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_LEQUAL: +            zmask = spu_cmpgt(ifragZ, ifbZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_EQUAL: +            zmask = spu_cmpeq(ifbZ, ifragZ); +            break; +         case PIPE_FUNC_NOTEQUAL: +            zmask = spu_cmpeq(ifbZ, ifragZ); +            zmask = spu_nor(zmask, zmask); +            break; +         case PIPE_FUNC_ALWAYS: +            zmask = spu_splats(0xffffffffU); +            break; +         case PIPE_FUNC_NEVER: +            zmask = spu_splats( 0x0U); +            break; +         default: +            ; +         } + +         mask = spu_and(mask, zmask); + +         /* merge framebuffer Z and fragment Z according to the mask */ +         ifbZ = spu_or(spu_and(ifragZ, mask), +                       spu_andc(ifbZ, mask)); +      } + +      if (spu_extract(spu_orx(mask), 0)) { +         /* put new fragment Z/Stencil values back into Z/Stencil tile */ +         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + +         spu.cur_ztile_status = TILE_STATUS_DIRTY; +      } +   } + + +   /* +    * If we'll need the current framebuffer/tile colors for blending +    * or logicop or colormask, fetch them now. 
+    */ +   if (spu.blend.blend_enable || +       spu.blend.logicop_enable || +       spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ +      fbc0 = colorTile->ui[y][x*2+0]; +      fbc1 = colorTile->ui[y][x*2+1]; +      fbc2 = colorTile->ui[y][x*2+2]; +      fbc3 = colorTile->ui[y][x*2+3]; +#else +      fbc0 = colorTile->ui[y+0][x+0]; +      fbc1 = colorTile->ui[y+0][x+1]; +      fbc2 = colorTile->ui[y+1][x+0]; +      fbc3 = colorTile->ui[y+1][x+1]; +#endif +   } + + +   /* +    * Do blending +    */ +   if (spu.blend.blend_enable) { +      /* blending terms, misc regs */ +      vector float term1r, term1g, term1b, term1a; +      vector float term2r, term2g, term2b, term2a; +      vector float one, tmp; + +      vector float fbRGBA[4];  /* current framebuffer colors */ + +      /* convert framebuffer colors from packed int to vector float */ +      { +         vector float temp[4]; /* float colors in AOS form */ +         switch (spu.fb.color_format) { +         case PIPE_FORMAT_B8G8R8A8_UNORM: +            temp[0] = spu_unpack_B8G8R8A8(fbc0); +            temp[1] = spu_unpack_B8G8R8A8(fbc1); +            temp[2] = spu_unpack_B8G8R8A8(fbc2); +            temp[3] = spu_unpack_B8G8R8A8(fbc3); +            break; +         case PIPE_FORMAT_A8R8G8B8_UNORM: +            temp[0] = spu_unpack_A8R8G8B8(fbc0); +            temp[1] = spu_unpack_A8R8G8B8(fbc1); +            temp[2] = spu_unpack_A8R8G8B8(fbc2); +            temp[3] = spu_unpack_A8R8G8B8(fbc3); +            break; +         default: +            ASSERT(0); +         } +         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ +      } + +      /* +       * Compute Src RGB terms (fragment color * factor) +       */ +      switch (spu.blend.rgb_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term1r = fragR; +         term1g = fragG; +         term1b = fragB; +         break; +      case PIPE_BLENDFACTOR_ZERO: +         term1r = +         term1g = +         term1b = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term1r = spu_mul(fragR, fragR); +         term1g = spu_mul(fragG, fragG); +         term1b = spu_mul(fragB, fragB); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term1r = spu_mul(fragR, fragA); +         term1g = spu_mul(fragG, fragA); +         term1b = spu_mul(fragB, fragA); +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         term1r = spu_mul(fragR, fbRGBA[0]); +         term1g = spu_mul(fragG, fbRGBA[1]); +         term1b = spu_mul(fragB, fbRGBA[2]); +         break; +      case PIPE_BLENDFACTOR_DST_ALPHA: +         term1r = spu_mul(fragR, fbRGBA[3]); +         term1g = spu_mul(fragG, fbRGBA[3]); +         term1b = spu_mul(fragB, fbRGBA[3]); +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); +         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); +         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); +         break; +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); +         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); +         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } + +      /* +       * Compute Src Alpha term (fragment alpha * factor) +       */ +      switch 
(spu.blend.alpha_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term1a = fragA; +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term1a = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term1a = spu_mul(fragA, fragA); +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_DST_ALPHA: +         term1a = spu_mul(fragA, fbRGBA[3]); +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         term1a = spu_mul(fragA, spu_splats(spu.blend_color.color[3])); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } + +      /* +       * Compute Dest RGB terms (framebuffer color * factor) +       */ +      switch (spu.blend.rgb_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term2r = fbRGBA[0]; +         term2g = fbRGBA[1]; +         term2b = fbRGBA[2]; +         break; +      case PIPE_BLENDFACTOR_ZERO: +         term2r = +         term2g = +         term2b = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term2r = spu_mul(fbRGBA[0], fragR); +         term2g = spu_mul(fbRGBA[1], fragG); +         term2b = spu_mul(fbRGBA[2], fragB); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term2r = spu_mul(fbRGBA[0], fragA); +         term2g = spu_mul(fbRGBA[1], fragA); +         term2b = spu_mul(fbRGBA[2], fragA); +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         one = spu_splats(1.0f); +         tmp = spu_sub(one, fragA); +         term2r = spu_mul(fbRGBA[0], tmp); +         term2g = spu_mul(fbRGBA[1], tmp); +         term2b = spu_mul(fbRGBA[2], tmp); +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         term2r = spu_mul(fbRGBA[0], fbRGBA[0]); +         term2g = spu_mul(fbRGBA[1], fbRGBA[1]); +         term2b = spu_mul(fbRGBA[2], fbRGBA[2]); +         break; +      case PIPE_BLENDFACTOR_DST_ALPHA: +         term2r = spu_mul(fbRGBA[0], fbRGBA[3]); +         term2g = spu_mul(fbRGBA[1], fbRGBA[3]); +         term2b = spu_mul(fbRGBA[2], fbRGBA[3]); +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); +         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); +         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); +         break; +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); +         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); +         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); +         break; +       /* XXX more cases */ +      default: +         ASSERT(0); +      } + +      /* +       * Compute Dest Alpha term (framebuffer alpha * factor) +       */ +      switch (spu.blend.alpha_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         term2a = fbRGBA[3]; +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         term2a = spu_splats(0.0f); +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         term2a = spu_mul(fbRGBA[3], fragA); +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         one = spu_splats(1.0f); +         tmp = spu_sub(one, fragA); +         term2a = spu_mul(fbRGBA[3], tmp); +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         /* fall-through */ +      case 
PIPE_BLENDFACTOR_DST_ALPHA: +         term2a = spu_mul(fbRGBA[3], fbRGBA[3]); +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); +         break; +      /* XXX more cases */ +      default: +         ASSERT(0); +      } + +      /* +       * Combine Src/Dest RGB terms +       */ +      switch (spu.blend.rgb_func) { +      case PIPE_BLEND_ADD: +         fragR = spu_add(term1r, term2r); +         fragG = spu_add(term1g, term2g); +         fragB = spu_add(term1b, term2b); +         break; +      case PIPE_BLEND_SUBTRACT: +         fragR = spu_sub(term1r, term2r); +         fragG = spu_sub(term1g, term2g); +         fragB = spu_sub(term1b, term2b); +         break; +      case PIPE_BLEND_REVERSE_SUBTRACT: +         fragR = spu_sub(term2r, term1r); +         fragG = spu_sub(term2g, term1g); +         fragB = spu_sub(term2b, term1b); +         break; +      case PIPE_BLEND_MIN: +         fragR = spu_min(term1r, term2r); +         fragG = spu_min(term1g, term2g); +         fragB = spu_min(term1b, term2b); +         break; +      case PIPE_BLEND_MAX: +         fragR = spu_max(term1r, term2r); +         fragG = spu_max(term1g, term2g); +         fragB = spu_max(term1b, term2b); +         break; +      default: +         ASSERT(0); +      } + +      /* +       * Combine Src/Dest A term +       */ +      switch (spu.blend.alpha_func) { +      case PIPE_BLEND_ADD: +         fragA = spu_add(term1a, term2a); +         break; +      case PIPE_BLEND_SUBTRACT: +         fragA = spu_sub(term1a, term2a); +         break; +      case PIPE_BLEND_REVERSE_SUBTRACT: +         fragA = spu_sub(term2a, term1a); +         break; +      case PIPE_BLEND_MIN: +         fragA = spu_min(term1a, term2a); +         break; +      case PIPE_BLEND_MAX: +         fragA = spu_max(term1a, term2a); +         break; +      default: +         ASSERT(0); +      } +   } + + +   /* +    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA. +    */ +#if 0 +   /* original code */ +   { +      vector float frag_soa[4]; +      frag_soa[0] = fragR; +      frag_soa[1] = fragG; +      frag_soa[2] = fragB; +      frag_soa[3] = fragA; +      _transpose_matrix4x4(frag_aos, frag_soa); +   } +#else +   /* short-cut relying on function parameter layout: */ +   _transpose_matrix4x4(frag_aos, &fragR); +   (void) fragG; +   (void) fragB; +#endif + +   /* +    * Pack fragment float colors into 32-bit RGBA words. 
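+    * After the transpose above, frag_aos[i] holds the RGBA color of pixel i
+    * of the quad, so each pack call below produces one 32-bit pixel.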
+    */ +   switch (spu.fb.color_format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); +      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); +      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); +      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); +      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); +      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); +      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); +      break; +   default: +      fprintf(stderr, "SPU: Bad pixel format in spu_fallback_fragment_ops\n"); +      ASSERT(0); +   } + + +   /* +    * Do color masking +    */ +   if (spu.blend.colormask != 0xf) { +      uint cmask = 0x0; /* each byte corresponds to a color channel */ + +      /* Form bitmask depending on color buffer format and colormask bits */ +      switch (spu.fb.color_format) { +      case PIPE_FORMAT_A8R8G8B8_UNORM: +         if (spu.blend.colormask & PIPE_MASK_R) +            cmask |= 0x00ff0000; /* red */ +         if (spu.blend.colormask & PIPE_MASK_G) +            cmask |= 0x0000ff00; /* green */ +         if (spu.blend.colormask & PIPE_MASK_B) +            cmask |= 0x000000ff; /* blue */ +         if (spu.blend.colormask & PIPE_MASK_A) +            cmask |= 0xff000000; /* alpha */ +         break; +      case PIPE_FORMAT_B8G8R8A8_UNORM: +         if (spu.blend.colormask & PIPE_MASK_R) +            cmask |= 0x0000ff00; /* red */ +         if (spu.blend.colormask & PIPE_MASK_G) +            cmask |= 0x00ff0000; /* green */ +         if (spu.blend.colormask & PIPE_MASK_B) +            cmask |= 0xff000000; /* blue */ +         if (spu.blend.colormask & PIPE_MASK_A) +            cmask |= 0x000000ff; /* alpha */ +         break; +      default: +         ASSERT(0); +      } + +      /* +       * Apply color mask to the 32-bit packed colors. +       * if (cmask[i]) +       *    frag color[i] = frag color[i]; +       * else +       *    frag color[i] = framebuffer color[i]; +       */ +      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); +      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); +      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); +      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); +   } + + +   /* +    * Do logic ops +    */ +   if (spu.blend.logicop_enable) { +      /* XXX to do */ +      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ +   } + + +   /* +    * If mask is non-zero, mark tile as dirty. +    */ +   if (spu_extract(spu_orx(mask), 0)) { +      spu.cur_ctile_status = TILE_STATUS_DIRTY; +   } +   else { +      /* write no fragments */ +      return; +   } + + +   /* +    * Write new fragment/quad colors to the framebuffer/tile. +    * Only write pixels where the corresponding mask word is set. +    */ +#if LINEAR_QUAD_LAYOUT +   /* +    * Quad layout: +    *  +--+--+--+--+ +    *  |p0|p1|p2|p3|... +    *  +--+--+--+--+ +    */ +   if (spu_extract(mask, 0)) +      colorTile->ui[y][x*2] = fragc0; +   if (spu_extract(mask, 1)) +      colorTile->ui[y][x*2+1] = fragc1; +   if (spu_extract(mask, 2)) +      colorTile->ui[y][x*2+2] = fragc2; +   if (spu_extract(mask, 3)) +      colorTile->ui[y][x*2+3] = fragc3; +#else +   /* +    * Quad layout: +    *  +--+--+ +    *  |p0|p1|... +    *  +--+--+ +    *  |p2|p3|... 
+    *  +--+--+ +    */ +   if (spu_extract(mask, 0)) +      colorTile->ui[y+0][x+0] = fragc0; +   if (spu_extract(mask, 1)) +      colorTile->ui[y+0][x+1] = fragc1; +   if (spu_extract(mask, 2)) +      colorTile->ui[y+1][x+0] = fragc2; +   if (spu_extract(mask, 3)) +      colorTile->ui[y+1][x+1] = fragc3; +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h new file mode 100644 index 0000000000..f817abf046 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -0,0 +1,44 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SPU_PER_FRAGMENT_OP +#define SPU_PER_FRAGMENT_OP + + +extern void +spu_fallback_fragment_ops(uint x, uint y, +                          tile_t *colorTile, +                          tile_t *depthStencilTile, +                          vector float fragZ, +                          vector float fragRed, +                          vector float fragGreen, +                          vector float fragBlue, +                          vector float fragAlpha, +                          vector unsigned int mask); + + +#endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c new file mode 100644 index 0000000000..7c225e2f27 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -0,0 +1,295 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include <stdio.h> +#include <libmisc.h> +#include <spu_mfcio.h> + +#include "spu_main.h" +#include "spu_render.h" +#include "spu_tri.h" +#include "spu_tile.h" +#include "cell/common.h" +#include "util/u_memory.h" + + +/** + * Given a rendering command's bounding box (in pixels) compute the + * location of the corresponding screen tile bounding box. + */ +static INLINE void +tile_bounding_box(const struct cell_command_render *render, +                  uint *txmin, uint *tymin, +                  uint *box_num_tiles, uint *box_width_tiles) +{ +#if 0 +   /* Debug: full-window bounding box */ +   uint txmax = spu.fb.width_tiles - 1; +   uint tymax = spu.fb.height_tiles - 1; +   *txmin = 0; +   *tymin = 0; +   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; +   *box_width_tiles = spu.fb.width_tiles; +   (void) render; +   (void) txmax; +   (void) tymax; +#else +   uint txmax, tymax, box_height_tiles; + +   *txmin = (uint) render->xmin / TILE_SIZE; +   *tymin = (uint) render->ymin / TILE_SIZE; +   txmax = (uint) render->xmax / TILE_SIZE; +   tymax = (uint) render->ymax / TILE_SIZE; +   if (txmax >= spu.fb.width_tiles) +      txmax = spu.fb.width_tiles-1; +   if (tymax >= spu.fb.height_tiles) +      tymax = spu.fb.height_tiles-1; +   *box_width_tiles = txmax - *txmin + 1; +   box_height_tiles = tymax - *tymin + 1; +   *box_num_tiles = *box_width_tiles * box_height_tiles; +#endif +#if 0 +   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id, +          render->xmin, render->ymin, render->xmax, render->ymax); +   printf("SPU %u: tiles:  %u, %u .. 
%u, %u\n", +           spu.init.id, *txmin, *tymin, txmax, tymax); +   ASSERT(render->xmin <= render->xmax); +   ASSERT(render->ymin <= render->ymax); +#endif +} + + +/** Check if the tile at (tx,ty) belongs to this SPU */ +static INLINE boolean +my_tile(uint tx, uint ty) +{ +   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id; +} + + +/** + * Start fetching non-clear color/Z tiles from main memory + */ +static INLINE void +get_cz_tiles(uint tx, uint ty) +{ +   if (spu.read_depth_stencil) { +      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { +         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); +         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); +         spu.cur_ztile_status = TILE_STATUS_GETTING; +      } +   } + +   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) { +      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty); +      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0); +      spu.cur_ctile_status = TILE_STATUS_GETTING; +   } +} + + +/** + * Start putting dirty color/Z tiles back to main memory + */ +static INLINE void +put_cz_tiles(uint tx, uint ty) +{ +   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) { +      /* tile was modified and needs to be written back */ +      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty); +      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1); +      spu.cur_ztile_status = TILE_STATUS_DEFINED; +   } +   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) { +      /* tile was never used */ +      spu.cur_ztile_status = TILE_STATUS_DEFINED; +      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty); +   } + +   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) { +      /* tile was modified and needs to be written back */ +      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty); +      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0); +      spu.cur_ctile_status = TILE_STATUS_DEFINED; +   } +   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) { +      /* tile was never used */ +      spu.cur_ctile_status = TILE_STATUS_DEFINED; +      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty); +   } +} + + +/** + * Wait for 'put' of color/z tiles to complete. 
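+ * This blocks on the MFC DMA tag groups used by put_cz_tiles() above, so
+ * the local tile buffers can safely be reused for the next tile.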
+ */ +static INLINE void +wait_put_cz_tiles(void) +{ +   wait_on_mask(1 << TAG_WRITE_TILE_COLOR); +   if (spu.read_depth_stencil) { +      wait_on_mask(1 << TAG_WRITE_TILE_Z); +   } +} + + +/** + * Render primitives + * \param pos_incr  returns value indicating how may words to skip after + *                  this command in the batch buffer + */ +void +cmd_render(const struct cell_command_render *render, uint *pos_incr) +{ +   /* we'll DMA into these buffers */ +   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB; +   const uint vertex_size = render->vertex_size; /* in bytes */ +   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size; +   uint index_bytes; +   const ubyte *vertices; +   const ushort *indexes; +   uint i, j; +   uint num_tiles; + +   D_PRINTF(CELL_DEBUG_CMD, +            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", +            render->prim_type, +            render->num_verts, +            render->num_indexes, +            render->inline_verts); + +   ASSERT(sizeof(*render) % 4 == 0); +   ASSERT(total_vertex_bytes % 16 == 0); +   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES); +   ASSERT(render->num_indexes % 3 == 0); + + +   /* indexes are right after the render command in the batch buffer */ +   indexes = (const ushort *) (render + 1); +   index_bytes = ROUNDUP8(render->num_indexes * 2); +   *pos_incr = index_bytes / 8 + sizeof(*render) / 8; + + +   if (render->inline_verts) { +      /* Vertices are after indexes in batch buffer at next 16-byte addr */ +      vertices = (const ubyte *) render + (*pos_incr * 8); +      vertices = (const ubyte *) align_pointer((void *) vertices, 16); +      ASSERT_ALIGN16(vertices); +      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8; +   } +   else { +      /* Begin DMA fetch of vertex buffer */ +      ubyte *src = spu.init.buffers[render->vertex_buf]; +      ubyte *dest = vertex_data; + +      /* skip vertex data we won't use */ +#if 01 +      src += render->min_index * vertex_size; +      dest += render->min_index * vertex_size; +      total_vertex_bytes -= render->min_index * vertex_size; +#endif +      ASSERT(total_vertex_bytes % 16 == 0); +      ASSERT_ALIGN16(dest); +      ASSERT_ALIGN16(src); + +      mfc_get(dest,   /* in vertex_data[] array */ +              (unsigned int) src,  /* src in main memory */ +              total_vertex_bytes,  /* size */ +              TAG_VERTEX_BUFFER, +              0, /* tid */ +              0  /* rid */); + +      vertices = vertex_data; + +      wait_on_mask(1 << TAG_VERTEX_BUFFER); +   } + + +   /** +    ** find tiles which intersect the prim bounding box +    **/ +   uint txmin, tymin, box_width_tiles, box_num_tiles; +   tile_bounding_box(render, &txmin, &tymin, +                     &box_num_tiles, &box_width_tiles); + + +   /* make sure any pending clears have completed */ +   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + + +   num_tiles = 0; + +   /** +    ** loop over tiles, rendering tris +    **/ +   for (i = 0; i < box_num_tiles; i++) { +      const uint tx = txmin + i % box_width_tiles; +      const uint ty = tymin + i / box_width_tiles; + +      ASSERT(tx < spu.fb.width_tiles); +      ASSERT(ty < spu.fb.height_tiles); + +      if (!my_tile(tx, ty)) +         continue; + +      num_tiles++; + +      spu.cur_ctile_status = spu.ctile_status[ty][tx]; +      spu.cur_ztile_status = spu.ztile_status[ty][tx]; + +      get_cz_tiles(tx, ty); + +      uint drawn = 0; + +      /* loop over tris */ +      for (j = 0; j < 
render->num_indexes; j += 3) { +         const float *v0, *v1, *v2; + +         v0 = (const float *) (vertices + indexes[j+0] * vertex_size); +         v1 = (const float *) (vertices + indexes[j+1] * vertex_size); +         v2 = (const float *) (vertices + indexes[j+2] * vertex_size); + +         drawn += tri_draw(v0, v1, v2, tx, ty); +      } + +      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); + +      /* write color/z tiles back to main framebuffer, if dirtied */ +      put_cz_tiles(tx, ty); + +      wait_put_cz_tiles(); /* XXX seems unnecessary... */ + +      spu.ctile_status[ty][tx] = spu.cur_ctile_status; +      spu.ztile_status[ty][tx] = spu.cur_ztile_status; +   } + +   D_PRINTF(CELL_DEBUG_CMD, +            "RENDER done (%u tiles hit)\n", +            num_tiles); +} diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h new file mode 100644 index 0000000000..493434f087 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.h @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef SPU_RENDER_H +#define SPU_RENDER_H + +#include "cell/common.h" + +extern void +cmd_render(const struct cell_command_render *render, uint *pos_incr); + +#endif /* SPU_RENDER_H */ + diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h new file mode 100644 index 0000000000..74f2a0b6d2 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_shuffle.h @@ -0,0 +1,186 @@ +#ifndef SPU_SHUFFLE_H +#define SPU_SHUFFLE_H + +/* + * Generate shuffle patterns with minimal fuss. 
+ * + * Based on ideas from  + * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf + * + * A-P indicates 0-15th position in first vector + * a-p indicates 0-15th position in second vector + * + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * |          A|          B|          C|          D| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |    A|    B|    C|    D|    E|    F|    G|    H| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * + * x or X indicates 0xff + * 8 indicates 0x80 + * 0 indicates 0x00 + * + * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector  + * unsigned char literal suitable for use with spu_shuffle(). + * + * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector  + * literal suitable for use with si_shufb(). + * + * + * For example : + * SHUFB4(A,A,A,A) + * expands to : + * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}) + *  + * SHUFFLE8(A,B,a,b,C,c,8,8) + * expands to : + * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13, + *				 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0}) + * + */ + +#include <spu_intrinsics.h> + +#define SHUFFLE_PATTERN_4_A__  0x00, 0x01, 0x02, 0x03 +#define SHUFFLE_PATTERN_4_B__  0x04, 0x05, 0x06, 0x07 +#define SHUFFLE_PATTERN_4_C__  0x08, 0x09, 0x0a, 0x0b +#define SHUFFLE_PATTERN_4_D__  0x0c, 0x0d, 0x0e, 0x0f +#define SHUFFLE_PATTERN_4_a__  0x10, 0x11, 0x12, 0x13 +#define SHUFFLE_PATTERN_4_b__  0x14, 0x15, 0x16, 0x17 +#define SHUFFLE_PATTERN_4_c__  0x18, 0x19, 0x1a, 0x1b +#define SHUFFLE_PATTERN_4_d__  0x1c, 0x1d, 0x1e, 0x1f +#define SHUFFLE_PATTERN_4_X__  0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_x__  0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_0__  0x80, 0x80, 0x80, 0x80 +#define SHUFFLE_PATTERN_4_8__  0xe0, 0xe0, 0xe0, 0xe0 + +#define SHUFFLE_VECTOR_4__(A, B, C, D) \ +   SHUFFLE_PATTERN_4_##A##__, \ +   SHUFFLE_PATTERN_4_##B##__, \ +   SHUFFLE_PATTERN_4_##C##__, \ +   SHUFFLE_PATTERN_4_##D##__ + +#define SHUFFLE4(A, B, C, D) \ +   ((const vector unsigned char){ \ +      SHUFFLE_VECTOR_4__(A, B, C, D) \ +   }) + +#define SHUFB4(A, B, C, D) \ +   ((const qword){ \ +      SHUFFLE_VECTOR_4__(A, B, C, D) \ +   }) + + +#define SHUFFLE_PATTERN_8_A__  0x00, 0x01 +#define SHUFFLE_PATTERN_8_B__  0x02, 0x03 +#define SHUFFLE_PATTERN_8_C__  0x04, 0x05 +#define SHUFFLE_PATTERN_8_D__  0x06, 0x07 +#define SHUFFLE_PATTERN_8_E__  0x08, 0x09 +#define SHUFFLE_PATTERN_8_F__  0x0a, 0x0b +#define SHUFFLE_PATTERN_8_G__  0x0c, 0x0d +#define SHUFFLE_PATTERN_8_H__  0x0e, 0x0f +#define SHUFFLE_PATTERN_8_a__  0x10, 0x11 +#define SHUFFLE_PATTERN_8_b__  0x12, 0x13 +#define SHUFFLE_PATTERN_8_c__  0x14, 0x15 +#define SHUFFLE_PATTERN_8_d__  0x16, 0x17 +#define SHUFFLE_PATTERN_8_e__  0x18, 0x19 +#define SHUFFLE_PATTERN_8_f__  0x1a, 0x1b +#define SHUFFLE_PATTERN_8_g__  0x1c, 0x1d +#define SHUFFLE_PATTERN_8_h__  0x1e, 0x1f +#define SHUFFLE_PATTERN_8_X__  0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_x__  0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_0__  0x80, 0x80 +#define SHUFFLE_PATTERN_8_8__  0xe0, 0xe0 + + +#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ +   SHUFFLE_PATTERN_8_##A##__, \ +   SHUFFLE_PATTERN_8_##B##__, \ +   SHUFFLE_PATTERN_8_##C##__, \ +   SHUFFLE_PATTERN_8_##D##__, \ +   SHUFFLE_PATTERN_8_##E##__, \ +   SHUFFLE_PATTERN_8_##F##__, \ +   
SHUFFLE_PATTERN_8_##G##__, \ +   SHUFFLE_PATTERN_8_##H##__ + +#define SHUFFLE8(A, B, C, D, E, F, G, H) \ +   ((const vector unsigned char){ \ +      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ +   }) + +#define SHUFB8(A, B, C, D, E, F, G, H) \ +   ((const qword){ \ +      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ +   }) + + +#define SHUFFLE_PATTERN_16_A__  0x00 +#define SHUFFLE_PATTERN_16_B__  0x01 +#define SHUFFLE_PATTERN_16_C__  0x02 +#define SHUFFLE_PATTERN_16_D__  0x03 +#define SHUFFLE_PATTERN_16_E__  0x04 +#define SHUFFLE_PATTERN_16_F__  0x05 +#define SHUFFLE_PATTERN_16_G__  0x06 +#define SHUFFLE_PATTERN_16_H__  0x07 +#define SHUFFLE_PATTERN_16_I__  0x08 +#define SHUFFLE_PATTERN_16_J__  0x09 +#define SHUFFLE_PATTERN_16_K__  0x0a +#define SHUFFLE_PATTERN_16_L__  0x0b +#define SHUFFLE_PATTERN_16_M__  0x0c +#define SHUFFLE_PATTERN_16_N__  0x0d +#define SHUFFLE_PATTERN_16_O__  0x0e +#define SHUFFLE_PATTERN_16_P__  0x0f +#define SHUFFLE_PATTERN_16_a__  0x10 +#define SHUFFLE_PATTERN_16_b__  0x11 +#define SHUFFLE_PATTERN_16_c__  0x12 +#define SHUFFLE_PATTERN_16_d__  0x13 +#define SHUFFLE_PATTERN_16_e__  0x14 +#define SHUFFLE_PATTERN_16_f__  0x15 +#define SHUFFLE_PATTERN_16_g__  0x16 +#define SHUFFLE_PATTERN_16_h__  0x17 +#define SHUFFLE_PATTERN_16_i__  0x18 +#define SHUFFLE_PATTERN_16_j__  0x19 +#define SHUFFLE_PATTERN_16_k__  0x1a +#define SHUFFLE_PATTERN_16_l__  0x1b +#define SHUFFLE_PATTERN_16_m__  0x1c +#define SHUFFLE_PATTERN_16_n__  0x1d +#define SHUFFLE_PATTERN_16_o__  0x1e +#define SHUFFLE_PATTERN_16_p__  0x1f +#define SHUFFLE_PATTERN_16_X__  0xc0 +#define SHUFFLE_PATTERN_16_x__  0xc0 +#define SHUFFLE_PATTERN_16_0__  0x80 +#define SHUFFLE_PATTERN_16_8__  0xe0 + +#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ +   SHUFFLE_PATTERN_16_##A##__, \ +   SHUFFLE_PATTERN_16_##B##__, \ +   SHUFFLE_PATTERN_16_##C##__, \ +   SHUFFLE_PATTERN_16_##D##__, \ +   SHUFFLE_PATTERN_16_##E##__, \ +   SHUFFLE_PATTERN_16_##F##__, \ +   SHUFFLE_PATTERN_16_##G##__, \ +   SHUFFLE_PATTERN_16_##H##__, \ +   SHUFFLE_PATTERN_16_##I##__, \ +   SHUFFLE_PATTERN_16_##J##__, \ +   SHUFFLE_PATTERN_16_##K##__, \ +   SHUFFLE_PATTERN_16_##L##__, \ +   SHUFFLE_PATTERN_16_##M##__, \ +   SHUFFLE_PATTERN_16_##N##__, \ +   SHUFFLE_PATTERN_16_##O##__, \ +   SHUFFLE_PATTERN_16_##P##__ + +#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ +   ((const vector unsigned char){ \ +      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ +   }) + +#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ +   ((const qword){ \ +      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ +   }) + +#endif diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c new file mode 100644 index 0000000000..69784c8978 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -0,0 +1,641 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include <math.h> + +#include "pipe/p_compiler.h" +#include "spu_main.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_colorpack.h" +#include "spu_dcache.h" + + +/** + * Mark all tex cache entries as invalid. + */ +void +invalidate_tex_cache(void) +{ +   uint lvl; +   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) { +      uint unit = 0; +      uint bytes = 4 * spu.texture[unit].level[lvl].width +         * spu.texture[unit].level[lvl].height; + +      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) +         bytes *= 6; +      else if (spu.texture[unit].target == PIPE_TEXTURE_3D) +         bytes *= spu.texture[unit].level[lvl].depth; + +      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); +   } +} + + +/** + * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ... + * + * NOTE: in the typical case of bilinear filtering, the four texels + * are in a 2x2 group so we could get by with just two dcache fetches + * (two side-by-side texels per fetch).  But when bilinear filtering + * wraps around a texture edge, we'll probably need code like we have + * now. + * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time, + * it's quite likely that the four pixels in a quad will need some of the + * same texels.  So look into doing texture fetches for four pixels at + * a time. 
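+ *
+ * Texture images are laid out as 32x32-texel tiles, so the byte offset of
+ * texel (x,y) is the tile's offset, (y/32 * tiles_per_row + x/32) *
+ * sizeof(tile_t), plus ((y & 31) * 32 + (x & 31)) * 4 within the tile.
+ * For example, texel (37,70) falls in tile (1,2) at intra-tile position
+ * (5,6), i.e. (6*32 + 5) * 4 = 788 bytes into that tile.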
+ */ +static void +get_four_texels(const struct spu_texture_level *tlevel, uint face, +                vec_int4 x, vec_int4 y, +                vec_uint4 *texels) +{ +   unsigned texture_ea = (uintptr_t) tlevel->start; +   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */ +   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */ +   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ +   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */ + +   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row); +   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); + +   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); +   tile_offset = si_mpy((qword) tile_offset, tile_size); + +   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x); +   texel_offset = si_mpyui(texel_offset, 4); +    +   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); +    +   texture_ea = texture_ea + face * tlevel->bytes_per_image; + +   spu_dcache_fetch_unaligned((qword *) & texels[0], +                              texture_ea + spu_extract(offset, 0), 4); +   spu_dcache_fetch_unaligned((qword *) & texels[1], +                              texture_ea + spu_extract(offset, 1), 4); +   spu_dcache_fetch_unaligned((qword *) & texels[2], +                              texture_ea + spu_extract(offset, 2), 4); +   spu_dcache_fetch_unaligned((qword *) & texels[3], +                              texture_ea + spu_extract(offset, 3), 4); +} + + +/** clamp vec to [0, max] */ +static INLINE vector signed int +spu_clamp(vector signed int vec, vector signed int max) +{ +   static const vector signed int zero = {0,0,0,0}; +   vector unsigned int c; +   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */ +   vec = spu_sel(zero, vec, c); +   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */ +   vec = spu_sel(vec, max, c); +   return vec; +} + + + +/** + * Do nearest texture sampling for four pixels. + * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_nearest(vector float s, vector float t, +                          uint unit, uint level, uint face, +                          vector float colors[4]) +{ +   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; +   vector float ss = spu_mul(s, tlevel->scale_s); +   vector float tt = spu_mul(t, tlevel->scale_t); +   vector signed int is = spu_convts(ss, 0); +   vector signed int it = spu_convts(tt, 0); +   vec_uint4 texels[4]; + +   /* PIPE_TEX_WRAP_REPEAT */ +   is = spu_and(is, tlevel->mask_s); +   it = spu_and(it, tlevel->mask_t); + +   /* PIPE_TEX_WRAP_CLAMP */ +   is = spu_clamp(is, tlevel->max_s); +   it = spu_clamp(it, tlevel->max_t); + +   get_four_texels(tlevel, face, is, it, texels); + +   /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ +   spu_unpack_A8R8G8B8_transpose4(texels, colors); +} + + +/** + * Do bilinear texture sampling for four pixels. + * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa). 
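+ *
+ * The four neighboring texels are combined with the usual bilinear weights;
+ * with (fs,ft) the fractional texel coordinates computed below:
+ *   color = (1-fs)(1-ft)*ul + fs(1-ft)*ur + (1-fs)ft*ll + fs*ft*lr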
+ */ +void +sample_texture_2d_bilinear(vector float s, vector float t, +                           uint unit, uint level, uint face, +                           vector float colors[4]) +{ +   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; +   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + +   vector float ss = spu_madd(s, tlevel->scale_s, half); +   vector float tt = spu_madd(t, tlevel->scale_t, half); + +   vector signed int is0 = spu_convts(ss, 0); +   vector signed int it0 = spu_convts(tt, 0); + +   /* is + 1, it + 1 */ +   vector signed int is1 = spu_add(is0, 1); +   vector signed int it1 = spu_add(it0, 1); + +   /* PIPE_TEX_WRAP_REPEAT */ +   is0 = spu_and(is0, tlevel->mask_s); +   it0 = spu_and(it0, tlevel->mask_t); +   is1 = spu_and(is1, tlevel->mask_s); +   it1 = spu_and(it1, tlevel->mask_t); + +   /* PIPE_TEX_WRAP_CLAMP */ +   is0 = spu_clamp(is0, tlevel->max_s); +   it0 = spu_clamp(it0, tlevel->max_t); +   is1 = spu_clamp(is1, tlevel->max_s); +   it1 = spu_clamp(it1, tlevel->max_t); + +   /* get packed int texels */ +   vector unsigned int texels[16]; +   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */ +   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */ +   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */ +   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + +   /* convert packed int texels to float colors */ +   vector float ftexels[16]; +   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0); +   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4); +   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8); +   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12); + +   /* Compute weighting factors in [0,1] +    * Multiply texcoord by 1024, AND with 1023, convert back to float. 
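+    * For example, ss = 3.25 gives 3.25 * 1024 = 3328, 3328 & 1023 = 256,
+    * and 256/1024 = 0.25, i.e. the fractional distance to the next texel.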
+    */ +   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f)); +   vector signed int iss1024 = spu_convts(ss1024, 0); +   iss1024 = spu_and(iss1024, 1023); +   vector float sWeights0 = spu_convtf(iss1024, 10); + +   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f)); +   vector signed int itt1024 = spu_convts(tt1024, 0); +   itt1024 = spu_and(itt1024, 1023); +   vector float tWeights0 = spu_convtf(itt1024, 10); + +   /* 1 - sWeight and 1 - tWeight */ +   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0); +   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0); + +   /* reds, for four pixels */ +   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/ +   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/ +   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/ +   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/ +   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]), +                       spu_add(ftexels[8], ftexels[12])); + +   /* greens, for four pixels */ +   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/ +   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/ +   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/ +   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/ +   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]), +                       spu_add(ftexels[9], ftexels[13])); + +   /* blues, for four pixels */ +   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/ +   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/ +   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/ +   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/ +   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]), +                       spu_add(ftexels[10], ftexels[14])); + +   /* alphas, for four pixels */ +   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/ +   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/ +   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/ +   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/ +   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]), +                       spu_add(ftexels[11], ftexels[15])); +} + + + +/** + * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h + */ +static INLINE void +transpose(vector unsigned int *mOut0, +          vector unsigned int *mOut1, +          vector unsigned int *mOut2, +          vector unsigned int *mOut3, +          vector unsigned int *mIn) +{ +  vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */ +  vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */ +  vector unsigned int aibj, ckdl, emfn, gohp;	/* intermediate vectors */ + +  vector unsigned char shufflehi = ((vector unsigned char) { +					       0x00, 0x01, 0x02, 0x03, +					       0x10, 0x11, 0x12, 0x13, +					       0x04, 0x05, 0x06, 0x07, +					       0x14, 0x15, 0x16, 0x17}); +  vector unsigned char shufflelo = ((vector unsigned char) { +					       0x08, 0x09, 0x0A, 0x0B, +					       0x18, 0x19, 0x1A, 0x1B, +					       0x0C, 0x0D, 0x0E, 0x0F, +					       0x1C, 0x1D, 0x1E, 0x1F}); +  abcd = *(mIn+0); +  efgh = *(mIn+1); +  ijkl = *(mIn+2); +  mnop = *(mIn+3); + +  aibj = spu_shuffle(abcd, ijkl, shufflehi); +  ckdl = spu_shuffle(abcd, ijkl, 
shufflelo); +  emfn = spu_shuffle(efgh, mnop, shufflehi); +  gohp = spu_shuffle(efgh, mnop, shufflelo); + +  aeim = spu_shuffle(aibj, emfn, shufflehi); +  bfjn = spu_shuffle(aibj, emfn, shufflelo); +  cgko = spu_shuffle(ckdl, gohp, shufflehi); +  dhlp = spu_shuffle(ckdl, gohp, shufflelo); + +  *mOut0 = aeim; +  *mOut1 = bfjn; +  *mOut2 = cgko; +  *mOut3 = dhlp; +} + + +/** + * Bilinear filtering, using int instead of float arithmetic for computing + * sample weights. + */ +void +sample_texture_2d_bilinear_int(vector float s, vector float t, +                               uint unit, uint level, uint face, +                               vector float colors[4]) +{ +   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; +   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + +   /* Scale texcoords by size of texture, and add half pixel bias */ +   vector float ss = spu_madd(s, tlevel->scale_s, half); +   vector float tt = spu_madd(t, tlevel->scale_t, half); + +   /* convert float coords to fixed-pt coords with 7 fraction bits */ +   vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */ +   vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */ + +   /* compute integer texel weights in [0, 127] */ +   vector signed int sWeights0 = spu_and(is, 127); +   vector signed int tWeights0 = spu_and(it, 127); +   vector signed int sWeights1 = spu_sub(127, sWeights0); +   vector signed int tWeights1 = spu_sub(127, tWeights0); + +   /* texel coords: is0 = is / 128, it0 = is / 128 */ +   vector signed int is0 = spu_rlmask(is, -7); +   vector signed int it0 = spu_rlmask(it, -7); + +   /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ +   vector signed int is1 = spu_add(is0, 1); +   vector signed int it1 = spu_add(it0, 1); + +   /* PIPE_TEX_WRAP_REPEAT */ +   is0 = spu_and(is0, tlevel->mask_s); +   it0 = spu_and(it0, tlevel->mask_t); +   is1 = spu_and(is1, tlevel->mask_s); +   it1 = spu_and(it1, tlevel->mask_t); + +   /* PIPE_TEX_WRAP_CLAMP */ +   is0 = spu_clamp(is0, tlevel->max_s); +   it0 = spu_clamp(it0, tlevel->max_t); +   is1 = spu_clamp(is1, tlevel->max_s); +   it1 = spu_clamp(it1, tlevel->max_t); + +   /* get packed int texels */ +   vector unsigned int texels[16]; +   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */ +   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */ +   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */ +   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + +   /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ +   { +      static const unsigned char ZERO = 0x80; +      int i; +      for (i = 0; i < 16; i++) { +         texels[i] = spu_shuffle(texels[i], texels[i], +                                 ((vector unsigned char) { +                                    ZERO, ZERO, ZERO, 1, +                                    ZERO, ZERO, ZERO, 2, +                                    ZERO, ZERO, ZERO, 3, +                                    ZERO, ZERO, ZERO, 0})); +      } +   } + +   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */ +   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7, +      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15; +   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0); +   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4); +   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8); +   
transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+   /* compute weighted colors */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 22);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 22);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 22);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 22);
+}
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static INLINE float
+compute_lambda_2d(uint unit, vector float s, vector float t)
+{
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].height;
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+   /* ideal value */
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+#else
+   /* approximation */
+   dsdx = fabsf(dsdx);
+   dsdy = fabsf(dsdy);
+   dtdx = fabsf(dtdx);
+   dtdy = fabsf(dtdy);
+   float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+   float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
+   return lambda;
+}
+
+
+/**
+ * Blend two sets of colors according to weight.
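+ * This is a plain linear interpolation, c0' = c0 + (c1 - c0) * weight,
+ * applied per channel to the four-pixel SOA color vectors.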
+ */ +static void +blend_colors(vector float c0[4], const vector float c1[4], float weight) +{ +   vector float t = spu_splats(weight); +   vector float dc0 = spu_sub(c1[0], c0[0]); +   vector float dc1 = spu_sub(c1[1], c0[1]); +   vector float dc2 = spu_sub(c1[2], c0[2]); +   vector float dc3 = spu_sub(c1[3], c0[3]); +   c0[0] = spu_madd(dc0, t, c0[0]); +   c0[1] = spu_madd(dc1, t, c0[1]); +   c0[2] = spu_madd(dc2, t, c0[2]); +   c0[3] = spu_madd(dc3, t, c0[3]); +} + + +/** + * Texture sampling with level of detail selection and possibly mipmap + * interpolation. + */ +void +sample_texture_2d_lod(vector float s, vector float t, +                      uint unit, uint level_ignored, uint face, +                      vector float colors[4]) +{ +   /* +    * Note that we're computing a lambda/lod here that's used for all +    * four pixels in the quad. +    */ +   float lambda = compute_lambda_2d(unit, s, t); + +   (void) face; +   (void) level_ignored; + +   /* apply lod bias */ +   lambda += spu.sampler[unit].lod_bias; + +   /* clamp */ +   if (lambda < spu.sampler[unit].min_lod) +      lambda = spu.sampler[unit].min_lod; +   else if (lambda > spu.sampler[unit].max_lod) +      lambda = spu.sampler[unit].max_lod; + +   if (lambda <= 0.0f) { +      /* magnify */ +      spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors); +   } +   else { +      /* minify */ +      if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { +         /* sample two mipmap levels and interpolate */ +         int level = (int) lambda; +         if (level > (int) spu.texture[unit].max_level) +            level = spu.texture[unit].max_level; +         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); +         if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { +            /* sample second mipmap level */ +            float weight = lambda - (float) level; +            level++; +            if (level <= (int) spu.texture[unit].max_level) { +               vector float colors2[4]; +               spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2); +               blend_colors(colors, colors2, weight); +            } +         } +      } +      else { +         /* sample one mipmap level */ +         int level = (int) (lambda + 0.5f); +         if (level > (int) spu.texture[unit].max_level) +            level = spu.texture[unit].max_level; +         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); +      } +   } +} + + +/** XXX need a SIMD version of this */ +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ +   /* +      major axis +      direction     target                             sc     tc    ma +      ----------    -------------------------------    ---    ---   --- +       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx +       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx +       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry +       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry +       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz +       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz +   */ +   const float arx = fabsf(rx); +   const float ary = fabsf(ry); +   const float arz = fabsf(rz); +   unsigned face; +   float sc, tc, ma; + +   if (arx > ary && arx > arz) { +      if (rx >= 0.0F) { +         face = PIPE_TEX_FACE_POS_X; +         sc = -rz; +         tc = -ry; +         ma = arx; 
+      } +      else { +         face = PIPE_TEX_FACE_NEG_X; +         sc = rz; +         tc = -ry; +         ma = arx; +      } +   } +   else if (ary > arx && ary > arz) { +      if (ry >= 0.0F) { +         face = PIPE_TEX_FACE_POS_Y; +         sc = rx; +         tc = rz; +         ma = ary; +      } +      else { +         face = PIPE_TEX_FACE_NEG_Y; +         sc = rx; +         tc = -rz; +         ma = ary; +      } +   } +   else { +      if (rz > 0.0F) { +         face = PIPE_TEX_FACE_POS_Z; +         sc = rx; +         tc = -ry; +         ma = arz; +      } +      else { +         face = PIPE_TEX_FACE_NEG_Z; +         sc = -rx; +         tc = -ry; +         ma = arz; +      } +   } + +   *newS = (sc / ma + 1.0F) * 0.5F; +   *newT = (tc / ma + 1.0F) * 0.5F; + +   return face; +} + + + +void +sample_texture_cube(vector float s, vector float t, vector float r, +                    uint unit, vector float colors[4]) +{ +   uint p, faces[4], level = 0; +   float newS[4], newT[4]; + +   /* Compute cube faces referenced by the four sets of texcoords. +    * XXX we should SIMD-ize this. +    */ +   for (p = 0; p < 4; p++) {       +      float rx = spu_extract(s, p); +      float ry = spu_extract(t, p); +      float rz = spu_extract(r, p); +      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]); +   } + +   if (faces[0] == faces[1] && +       faces[0] == faces[2] && +       faces[0] == faces[3]) { +      /* GOOD!  All four texcoords refer to the same cube face */ +      s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; +      t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; +      spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors); +   } +   else { +      /* BAD!  The four texcoords refer to different faces */ +      for (p = 0; p < 4; p++) {       +         vector float c[4]; + +         spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]), +                                     unit, level, faces[p], c); + +         float red = spu_extract(c[0], p); +         float green = spu_extract(c[1], p); +         float blue = spu_extract(c[2], p); +         float alpha = spu_extract(c[3], p); + +         colors[0] = spu_insert(red,   colors[0], p); +         colors[1] = spu_insert(green, colors[1], p); +         colors[2] = spu_insert(blue,  colors[2], p); +         colors[3] = spu_insert(alpha, colors[3], p); +      } +   } +} diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h new file mode 100644 index 0000000000..7b75b007b5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -0,0 +1,67 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SPU_TEXTURE_H +#define SPU_TEXTURE_H + + +#include "pipe/p_compiler.h" + + +extern void +invalidate_tex_cache(void); + + +extern void +sample_texture_2d_nearest(vector float s, vector float t, +                          uint unit, uint level, uint face, +                          vector float colors[4]); + + +extern void +sample_texture_2d_bilinear(vector float s, vector float t, +                           uint unit, uint level, uint face, +                           vector float colors[4]); + +extern void +sample_texture_2d_bilinear_int(vector float s, vector float t, +                               uint unit, uint level, uint face, +                               vector float colors[4]); + + +extern void +sample_texture_2d_lod(vector float s, vector float t, +                      uint unit, uint level, uint face, +                      vector float colors[4]); + + +extern void +sample_texture_cube(vector float s, vector float t, vector float r, +                    uint unit, vector float colors[4]); + + +#endif /* SPU_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c new file mode 100644 index 0000000000..6905015a48 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tile.c @@ -0,0 +1,126 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + + +#include "spu_tile.h" +#include "spu_main.h" + + +/** + * Get tile of color or Z values from main memory, put into SPU memory. 
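+ * Tiles are stored contiguously in row-major order, so the source address
+ * is (ty * width_tiles + tx) * bytesPerTile past the start of the color
+ * (or depth) buffer; the whole tile is fetched with one tagged mfc_get()
+ * so callers can overlap the DMA with other work.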
+ */
+void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   const ubyte *src = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   src += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
+          tile, (unsigned int) src, bytesPerTile);
+   */
+   mfc_get(tile->ui,  /* dest in local memory */
+           (unsigned int) src, /* src in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * Move tile of color or Z values from SPU memory to main memory.
+ */
+void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
+{
+   const uint offset = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4);
+   ubyte *dst = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+
+   dst += offset * bytesPerTile;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+   /*
+   printf("SPU %u: put_tile:  src: %p  dst: 0x%x  size: %d\n",
+          spu.init.id,
+          tile, (unsigned int) dst, bytesPerTile);
+   */
+   mfc_put((void *) tile->ui,  /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           bytesPerTile,
+           tag,
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
new file mode 100644
index 0000000000..7bfb52be8f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SPU_TILE_H +#define SPU_TILE_H + + +#include <libmisc.h> +#include <spu_mfcio.h> +#include "spu_main.h" +#include "cell/common.h" + + + +extern void +get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); + +extern void +put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); + +extern void +really_clear_tiles(uint surfaceIndex); + + +static INLINE void +clear_c_tile(tile_t *ctile) +{ +   memset32((uint*) ctile->ui, +            spu.fb.color_clear_value, +            TILE_SIZE * TILE_SIZE); +} + + +static INLINE void +clear_z_tile(tile_t *ztile) +{ +   if (spu.fb.zsize == 2) { +      memset16((ushort*) ztile->us, +               spu.fb.depth_clear_value, +               TILE_SIZE * TILE_SIZE); +   } +   else { +      ASSERT(spu.fb.zsize != 0); +      memset32((uint*) ztile->ui, +               spu.fb.depth_clear_value, +               TILE_SIZE * TILE_SIZE); +   } +} + + +#endif /* SPU_TILE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c new file mode 100644 index 0000000000..0d9fcb9997 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -0,0 +1,809 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * Triangle rendering within a tile. + */ + +#include <transpose_matrix4x4.h> +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "util/u_math.h" +#include "spu_colorpack.h" +#include "spu_main.h" +#include "spu_shuffle.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_tri.h" + + +/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ +typedef vector unsigned int mask_t; + + + +/** + * Simplified types taken from other parts of Gallium + */ +struct vertex_header { +   vector float data[1]; +}; + + + +/* XXX fix this */ +#undef CEILF +#define CEILF(X) ((float) (int) ((X) + 0.99999f)) + + +#define QUAD_TOP_LEFT     0 +#define QUAD_TOP_RIGHT    1 +#define QUAD_BOTTOM_LEFT  2 +#define QUAD_BOTTOM_RIGHT 3 +#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL          0xf + + +#define DEBUG_VERTS 0 + +/** + * Triangle edge info + */ +struct edge { +   union { +      struct { +         float dx;	/**< X(v1) - X(v0), used only during setup */ +         float dy;	/**< Y(v1) - Y(v0), used only during setup */ +      }; +      vec_float4 ds;    /**< vector accessor for dx and dy */ +   }; +   float dxdy;		/**< dx/dy */ +   float sx, sy;	/**< first sample point coord */ +   int lines;		/**< number of lines on this edge */ +}; + + +struct interp_coef +{ +   vector float a0; +   vector float dadx; +   vector float dady; +}; + + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_stage { + +   /* Vertices are just an array of floats making up each attribute in +    * turn.  Currently fixed at 4 floats, but should change in time. +    * Codegen will help cope with this. +    */ +   union { +      struct { +         const struct vertex_header *vmin; +         const struct vertex_header *vmid; +         const struct vertex_header *vmax; +         const struct vertex_header *vprovoke; +      }; +      qword vertex_headers; +   }; + +   struct edge ebot; +   struct edge etop; +   struct edge emaj; + +   float oneOverArea;  /* XXX maybe make into vector? */ + +   uint facing; + +   uint tx, ty;  /**< position of current tile (x, y) */ + +   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; + +   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; + +   struct { +      vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */ +      int y; +      unsigned y_flags; +      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ +   } span; +}; + + +static struct setup_stage setup; + + +/** + * Evaluate attribute coefficients (plane equations) to compute + * attribute values for the four fragments in a quad. + * Eg: four colors will be computed (in AoS format). 
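+ *
+ * For linear interpolants each value comes from the plane equation
+ *   a(x,y) = a0 + dadx * x + dady * y
+ * evaluated at the quad's top-left pixel; the other three pixels just add
+ * dadx and/or dady.  Perspective interpolants are additionally scaled by
+ * 1/w.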
+ */ +static INLINE void +eval_coeff(uint slot, float x, float y, vector float w, vector float result[4]) +{ +   switch (spu.vertex_info.attrib[slot].interp_mode) { +   case INTERP_CONSTANT: +      result[QUAD_TOP_LEFT] = +      result[QUAD_TOP_RIGHT] = +      result[QUAD_BOTTOM_LEFT] = +      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0; +      break; +   case INTERP_LINEAR: +      { +         vector float dadx = setup.coef[slot].dadx; +         vector float dady = setup.coef[slot].dady; +         vector float topLeft = +            spu_add(setup.coef[slot].a0, +                    spu_add(spu_mul(spu_splats(x), dadx), +                            spu_mul(spu_splats(y), dady))); + +         result[QUAD_TOP_LEFT] = topLeft; +         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx); +         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady); +         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady); +      } +      break; +   case INTERP_PERSPECTIVE: +      { +         vector float dadx = setup.coef[slot].dadx; +         vector float dady = setup.coef[slot].dady; +         vector float topLeft = +            spu_add(setup.coef[slot].a0, +                    spu_add(spu_mul(spu_splats(x), dadx), +                            spu_mul(spu_splats(y), dady))); + +         vector float wInv = spu_re(w);  /* 1.0 / w */ + +         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv); +         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv); +         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv); +         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv); +      } +      break; +   case INTERP_POS: +   case INTERP_NONE: +      break; +   default: +      ASSERT(0); +   } +} + + +/** + * As above, but return 4 vectors in SOA format. + * XXX this will all be re-written someday. + */ +static INLINE void +eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4]) +{ +   eval_coeff(slot, x, y, w, result); +   _transpose_matrix4x4(result, result); +} + + +/** Evalute coefficients to get Z for four pixels in a quad */ +static INLINE vector float +eval_z(float x, float y) +{ +   const uint slot = 0; +   const float dzdx = spu_extract(setup.coef[slot].dadx, 2); +   const float dzdy = spu_extract(setup.coef[slot].dady, 2); +   const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy; +   const vector float topLeftv = spu_splats(topLeft); +   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; +   return spu_add(topLeftv, derivs); +} + + +/** Evalute coefficients to get W for four pixels in a quad */ +static INLINE vector float +eval_w(float x, float y) +{ +   const uint slot = 0; +   const float dwdx = spu_extract(setup.coef[slot].dadx, 3); +   const float dwdy = spu_extract(setup.coef[slot].dady, 3); +   const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy; +   const vector float topLeftv = spu_splats(topLeft); +   const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy }; +   return spu_add(topLeftv, derivs); +} + + +/** + * Emit a quad (pass to next stage).  No clipping is done. + * Note: about 1/5 to 1/7 of the time, mask is zero and this function + * should be skipped.  But adding the test for that slows things down + * overall. + */ +static INLINE void +emit_quad( int x, int y, mask_t mask) +{ +   /* If any bits in mask are set... 
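+    * (spu_orx() ORs the four mask words into element 0, so the single
+    * spu_extract() below tests all four pixels of the quad at once.)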
*/ +   if (spu_extract(spu_orx(mask), 0)) { +      const int ix = x - setup.cliprect_minx; +      const int iy = y - setup.cliprect_miny; + +      spu.cur_ctile_status = TILE_STATUS_DIRTY; +      spu.cur_ztile_status = TILE_STATUS_DIRTY; + +      { +         /* +          * Run fragment shader, execute per-fragment ops, update fb/tile. +          */ +         vector float inputs[4*4], outputs[2*4]; +         vector float fragZ = eval_z((float) x, (float) y); +         vector float fragW = eval_w((float) x, (float) y); +         vector unsigned int kill_mask; + +         /* setup inputs */ +#if 0 +         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs); +#else +         uint i; +         for (i = 0; i < spu.vertex_info.num_attribs; i++) { +            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4); +         } +#endif +         ASSERT(spu.fragment_program); +         ASSERT(spu.fragment_ops); + +         /* Execute the current fragment program */ +         kill_mask = spu.fragment_program(inputs, outputs, spu.constants); + +         mask = spu_andc(mask, kill_mask); + +         /* Execute per-fragment/quad operations, including: +          * alpha test, z test, stencil test, blend and framebuffer writing. +          * Note that there are two different fragment operations functions +          * that can be called, one for front-facing fragments, and one +          * for back-facing fragments.  (Often the two are the same; +          * but in some cases, like two-sided stenciling, they can be +          * very different.)  So choose the correct function depending +          * on the calculated facing. +          */ +         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, +                          fragZ, +                          outputs[0*4+0], +                          outputs[0*4+1], +                          outputs[0*4+2], +                          outputs[0*4+3], +                          mask); +      } +   } +} + + +/** + * Given an X or Y coordinate, return the block/quad coordinate that it + * belongs to. + */ +static INLINE int +block(int x) +{ +   return x & ~1; +} + + +/** + * Render a horizontal span of quads + */ +static void +flush_spans(void) +{ +   int minleft, maxright; + +   const int l0 = spu_extract(setup.span.quad, 0); +   const int l1 = spu_extract(setup.span.quad, 1); +   const int r0 = spu_extract(setup.span.quad, 2); +   const int r1 = spu_extract(setup.span.quad, 3); + +   switch (setup.span.y_flags) { +   case 0x3: +      /* both odd and even lines written (both quad rows) */ +      minleft = MIN2(l0, l1); +      maxright = MAX2(r0, r1); +      break; + +   case 0x1: +      /* only even line written (quad top row) */ +      minleft = l0; +      maxright = r0; +      break; + +   case 0x2: +      /* only odd line written (quad bottom row) */ +      minleft = l1; +      maxright = r1; +      break; + +   default: +      return; +   } + +   /* OK, we're very likely to need the tile data now. +    * clear or finish waiting if needed. 
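+    * (Status transitions handled below: GETTING -> wait for the DMA,
+    * then CLEAN; CLEAR -> clear the tile in place, then DIRTY.)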
+    */ +   if (spu.cur_ctile_status == TILE_STATUS_GETTING) { +      /* wait for mfc_get() to complete */ +      //printf("SPU: %u: waiting for ctile\n", spu.init.id); +      wait_on_mask(1 << TAG_READ_TILE_COLOR); +      spu.cur_ctile_status = TILE_STATUS_CLEAN; +   } +   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { +      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); +      clear_c_tile(&spu.ctile); +      spu.cur_ctile_status = TILE_STATUS_DIRTY; +   } +   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); + +   if (spu.read_depth_stencil) { +      if (spu.cur_ztile_status == TILE_STATUS_GETTING) { +         /* wait for mfc_get() to complete */ +         //printf("SPU: %u: waiting for ztile\n", spu.init.id); +         wait_on_mask(1 << TAG_READ_TILE_Z); +         spu.cur_ztile_status = TILE_STATUS_CLEAN; +      } +      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { +         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); +         clear_z_tile(&spu.ztile); +         spu.cur_ztile_status = TILE_STATUS_DIRTY; +      } +      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); +   } + +   /* XXX this loop could be moved into the above switch cases... */ +    +   /* Setup for mask calculation */ +   const vec_int4 quad_LlRr = setup.span.quad; +   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); +   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); +   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); + +   const vec_int4 twos = spu_splats(2); + +   const int x = block(minleft); +   vec_int4 xs = {x, x+1, x, x+1}; + +   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { +      /** +       * Computes mask to indicate which pixels in the 2x2 quad are actually +       * inside the triangle's bounds. +       */ +       +      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ +      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); +      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);  +       +      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ +      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); + +      /* Combine results to create mask */ +      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); + +      emit_quad(spu_extract(xs, 0), setup.span.y, mask); +   } + +   setup.span.y = 0; +   setup.span.y_flags = 0; +   /* Zero right elements */ +   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); +} + + +#if DEBUG_VERTS +static void +print_vertex(const struct vertex_header *v) +{ +   uint i; +   fprintf(stderr, "  Vertex: (%p)\n", v); +   for (i = 0; i < spu.vertex_info.num_attribs; i++) { +      fprintf(stderr, "    %d: %f %f %f %f\n",  i,  +              spu_extract(v->data[i], 0), +              spu_extract(v->data[i], 1), +              spu_extract(v->data[i], 2), +              spu_extract(v->data[i], 3)); +   } +} +#endif + + +/** + * Sort vertices from top to bottom. + * Compute area and determine front vs. back facing. 
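+ * (The sort is branchless: the three y comparisons are packed into a
+ * 4-bit spu_gather() value that indexes a table of shuffle patterns.)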
+ * Do coarse clip test against tile bounds + * \return  FALSE if tri is totally outside tile, TRUE otherwise + */ +static boolean +setup_sort_vertices(const struct vertex_header *v0, +                    const struct vertex_header *v1, +                    const struct vertex_header *v2) +{ +   float area, sign; + +#if DEBUG_VERTS +   if (spu.init.id==0) { +      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id); +      print_vertex(v0); +      print_vertex(v1); +      print_vertex(v2); +   } +#endif + +   /* determine bottom to top order of vertices */ +   { +      /* A table of shuffle patterns for putting vertex_header pointers into +         correct order.  Quite magical. */ +      const vec_uchar16 sort_order_patterns[] = { +         SHUFFLE4(A,B,C,C), +         SHUFFLE4(C,A,B,C), +         SHUFFLE4(A,C,B,C), +         SHUFFLE4(B,C,A,C), +         SHUFFLE4(B,A,C,C), +         SHUFFLE4(C,B,A,C) }; + +      /* The vertex_header pointers, packed for easy shuffling later */ +      const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2}; + +      /* Collate y values into two vectors for comparison. +         Using only one shuffle constant! ;) */ +      const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C)); +      const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C)); +      const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C)); +      const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C)); + +      /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */ +      const vec_uint4 compare = spu_cmpgt(y_012, y_120); +      /* Compress the result of the comparison into 4 bits */ +      const vec_uint4 gather = spu_gather(compare); +      /* Subtract one to attain the index into the LUT.  Magical. */ +      const unsigned int index = spu_extract(gather, 0) - 1; + +      /* Load the appropriate pattern and construct the desired vector. */ +      setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]); + +      /* Using the result of the comparison, set sign. +         Very magical. */ +      sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f); +   } + +   /* Check if triangle is completely outside the tile bounds */ +   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy) +      return FALSE; +   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny) +      return FALSE; +   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx && +       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx && +       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx) +      return FALSE; +   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx && +       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx && +       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) +      return FALSE; + +   setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]); +   setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]); +   setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]); + +   /* +    * Compute triangle's area.  Use 1/area to compute partial +    * derivatives of attributes later. +    */ +   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy; + +   setup.oneOverArea = 1.0f / area; + +   /* The product of area * sign indicates front/back orientation (0/1). 
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
+   setup.facing = (area * sign > 0.0f)
+      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+   return TRUE;
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value comes from the provoking vertex's data[slot].
+ * The result will be put into setup.coef[slot].a0.
+ * \param slot  which attribute slot
+ */
+static INLINE void
+const_coeff4(uint slot)
+{
+   setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0 = setup.vprovoke->data[slot];
+}
+
+
+/**
+ * As above, but set up linear interpolation (a0, dadx, dady) for all
+ * four vector components.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
+
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
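+ * Roughly, per channel:
+ *
+ *    a0, dadx, dady are fitted to (attr * 1/w) at the three vertices,
+ *    and eval_coeff()'s INTERP_PERSPECTIVE case undoes the 1/w factor
+ *    per fragment to recover the perspective-correct attribute value.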
+ */ +static void +tri_persp_coeff4(uint slot) +{ +   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); +   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + +   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3)); +   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3)); +   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3)); + +   vector float vmin_d = setup.vmin->data[slot]; +   vector float vmid_d = setup.vmid->data[slot]; +   vector float vmax_d = setup.vmax->data[slot]; + +   vmin_d = spu_mul(vmin_d, vmin_w); +   vmid_d = spu_mul(vmid_d, vmid_w); +   vmax_d = spu_mul(vmax_d, vmax_w); + +   vector float botda = vmid_d - vmin_d; +   vector float majda = vmax_d - vmin_d; + +   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), +                            spu_mul(botda, spu_splats(setup.emaj.dy))); +   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), +                            spu_mul(majda, spu_splats(setup.ebot.dx))); + +   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); +   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + +   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); +   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); +                          +   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + + +/** + * Compute the setup.coef[] array dadx, dady, a0 values. + * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized. + */ +static void +setup_tri_coefficients(void) +{ +   uint i; + +   for (i = 0; i < spu.vertex_info.num_attribs; i++) { +      switch (spu.vertex_info.attrib[i].interp_mode) { +      case INTERP_NONE: +         break; +      case INTERP_CONSTANT: +         const_coeff4(i); +         break; +      case INTERP_POS: +         /* fall-through */ +      case INTERP_LINEAR: +         tri_linear_coeff4(i); +         break; +      case INTERP_PERSPECTIVE: +         tri_persp_coeff4(i); +         break; +      default: +         ASSERT(0); +      } +   } +} + + +static void +setup_tri_edges(void) +{ +   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f; +   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f; + +   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f; +   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f; +   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f; + +   setup.emaj.sy = CEILF(vmin_y); +   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy); +   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy; +   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy; + +   setup.etop.sy = CEILF(vmid_y); +   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy); +   setup.etop.dxdy = setup.etop.dx / setup.etop.dy; +   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy; + +   setup.ebot.sy = CEILF(vmin_y); +   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy); +   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy; +   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy; +} + + +/** + * Render the upper or lower half of a triangle. + * Scissoring/cliprect is applied here too. 
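+ * Each scanline's span is computed directly as
+ *
+ *    left  = eleft->sx  + y * eleft->dxdy
+ *    right = eright->sx + y * eright->dxdy
+ *
+ * (multiplying instead of accumulating, to avoid float drift; see the
+ * comment in the loop below).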
+ */ +static void +subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) +{ +   const int minx = setup.cliprect_minx; +   const int maxx = setup.cliprect_maxx; +   const int miny = setup.cliprect_miny; +   const int maxy = setup.cliprect_maxy; +   int y, start_y, finish_y; +   int sy = (int)eleft->sy; + +   ASSERT((int)eleft->sy == (int) eright->sy); + +   /* clip top/bottom */ +   start_y = sy; +   finish_y = sy + lines; + +   if (start_y < miny) +      start_y = miny; + +   if (finish_y > maxy) +      finish_y = maxy; + +   start_y -= sy; +   finish_y -= sy; + +   /* +   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);   +   */ + +   for (y = start_y; y < finish_y; y++) { + +      /* avoid accumulating adds as floats don't have the precision to +       * accurately iterate large triangle edges that way.  luckily we +       * can just multiply these days. +       * +       * this is all drowned out by the attribute interpolation anyway. +       */ +      int left = (int)(eleft->sx + y * eleft->dxdy); +      int right = (int)(eright->sx + y * eright->dxdy); + +      /* clip left/right */ +      if (left < minx) +         left = minx; +      if (right > maxx) +         right = maxx; + +      if (left < right) { +         int _y = sy + y; +         if (block(_y) != setup.span.y) { +            flush_spans(); +            setup.span.y = block(_y); +         } + +         int offset = _y&1; +         vec_int4 quad_LlRr = {left, left, right, right}; +         /* Store left and right in 0 or 1 row of quad based on offset */ +         setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset)); +         setup.span.y_flags |= 1<<offset; +      } +   } + + +   /* save the values so that emaj can be restarted: +    */ +   eleft->sx += lines * eleft->dxdy; +   eright->sx += lines * eright->dxdy; +   eleft->sy += lines; +   eright->sy += lines; +} + + +/** + * Draw triangle into tile at (tx, ty) (tile coords) + * The tile data should have already been fetched. 
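+ * Overall flow: clamp the cliprect to this tile, sort the vertices,
+ * compute interpolation coefficients and edges, then rasterize the two
+ * subtriangles (ebot/emaj and etop/emaj) as 2x2 quads via flush_spans().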
+ */ +boolean +tri_draw(const float *v0, const float *v1, const float *v2, +         uint tx, uint ty) +{ +   setup.tx = tx; +   setup.ty = ty; + +   /* set clipping bounds to tile bounds */ +   setup.cliprect_minx = tx * TILE_SIZE; +   setup.cliprect_miny = ty * TILE_SIZE; +   setup.cliprect_maxx = (tx + 1) * TILE_SIZE; +   setup.cliprect_maxy = (ty + 1) * TILE_SIZE; + +   if (!setup_sort_vertices((struct vertex_header *) v0, +                            (struct vertex_header *) v1, +                            (struct vertex_header *) v2)) { +      return FALSE; /* totally clipped */ +   } + +   setup_tri_coefficients(); +   setup_tri_edges(); + +   setup.span.y = 0; +   setup.span.y_flags = 0; +   /* Zero right elements */ +   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); + +   if (setup.oneOverArea < 0.0) { +      /* emaj on left */ +      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines ); +      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines ); +   } +   else { +      /* emaj on right */ +      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines ); +      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines ); +   } + +   flush_spans(); + +   return TRUE; +} diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h new file mode 100644 index 0000000000..aa694dd7c9 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -0,0 +1,37 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#ifndef SPU_TRI_H +#define SPU_TRI_H + + +extern boolean +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); + + +#endif /* SPU_TRI_H */ diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c new file mode 100644 index 0000000000..b8a0d4a265 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_util.c @@ -0,0 +1,167 @@ + +#include "cell/common.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +//#include "tgsi_build.h" +#include "tgsi/tgsi_util.h" + +unsigned +tgsi_util_get_src_register_swizzle( +   const struct tgsi_src_register *reg, +   unsigned component ) +{ +   switch( component ) { +   case 0: +      return reg->SwizzleX; +   case 1: +      return reg->SwizzleY; +   case 2: +      return reg->SwizzleZ; +   case 3: +      return reg->SwizzleW; +   default: +      ASSERT( 0 ); +   } +   return 0; +} + +unsigned +tgsi_util_get_src_register_extswizzle( +   const struct tgsi_src_register_ext_swz *reg, +   unsigned component ) +{ +   switch( component ) { +   case 0: +      return reg->ExtSwizzleX; +   case 1: +      return reg->ExtSwizzleY; +   case 2: +      return reg->ExtSwizzleZ; +   case 3: +      return reg->ExtSwizzleW; +   default: +      ASSERT( 0 ); +   } +   return 0; +} + +unsigned +tgsi_util_get_full_src_register_extswizzle( +   const struct tgsi_full_src_register  *reg, +   unsigned component ) +{ +   unsigned swizzle; + +   /* +    * First, calculate  the   extended swizzle for a given channel. This will give +    * us either a channel index into the simple swizzle or  a constant 1 or   0. +    */ +   swizzle = tgsi_util_get_src_register_extswizzle( +      ®->SrcRegisterExtSwz, +      component ); + +   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X); +   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y); +   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z); +   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W); +   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W); +   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W); + +   /* +    * Second, calculate the simple  swizzle  for   the   unswizzled channel index. +    * Leave the constants intact, they are   not   affected by the   simple swizzle. +    */ +   if( swizzle <= TGSI_SWIZZLE_W ) { +      swizzle = tgsi_util_get_src_register_swizzle( +         ®->SrcRegister, +         component ); +   } + +   return swizzle; +} + +unsigned +tgsi_util_get_src_register_extnegate( +   const  struct tgsi_src_register_ext_swz *reg, +   unsigned component ) +{ +   switch( component ) { +   case 0: +      return reg->NegateX; +   case 1: +      return reg->NegateY; +   case 2: +      return reg->NegateZ; +   case 3: +      return reg->NegateW; +   default: +      ASSERT( 0 ); +   } +   return 0; +} + +void +tgsi_util_set_src_register_extnegate( +   struct tgsi_src_register_ext_swz *reg, +   unsigned negate, +   unsigned component ) +{ +   switch( component ) { +   case 0: +      reg->NegateX = negate; +      break; +   case 1: +      reg->NegateY = negate; +      break; +   case 2: +      reg->NegateZ = negate; +      break; +   case 3: +      reg->NegateW = negate; +      break; +   default: +      ASSERT( 0 ); +   } +} + +unsigned +tgsi_util_get_full_src_register_sign_mode( +   const struct  tgsi_full_src_register *reg, +   unsigned component ) +{ +   unsigned sign_mode; + +   if( reg->SrcRegisterExtMod.Absolute ) { +      /* Consider only the post-abs negation. 
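+       * abs() has already discarded any earlier negations, so the result
+       * here is either SIGN_SET or SIGN_CLEAR.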
*/ + +      if( reg->SrcRegisterExtMod.Negate ) { +         sign_mode = TGSI_UTIL_SIGN_SET; +      } +      else { +         sign_mode = TGSI_UTIL_SIGN_CLEAR; +      } +   } +   else { +      /* Accumulate the three negations. */ + +      unsigned negate; + +      negate = reg->SrcRegister.Negate; +      if( tgsi_util_get_src_register_extnegate( ®->SrcRegisterExtSwz, component ) ) { +         negate = !negate; +      } +      if( reg->SrcRegisterExtMod.Negate ) { +         negate = !negate; +      } + +      if( negate ) { +         sign_mode = TGSI_UTIL_SIGN_TOGGLE; +      } +      else { +         sign_mode = TGSI_UTIL_SIGN_KEEP; +      } +   } + +   return sign_mode; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c new file mode 100644 index 0000000000..03375d84a5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -0,0 +1,145 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  *   Ian Romanick <idr@us.ibm.com> +  */ + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "spu_exec.h" +#include "spu_vertex_shader.h" +#include "spu_main.h" +#include "spu_dcache.h" + +typedef void (*spu_fetch_func)(qword *out, const qword *in, +			       const qword *shuffle_data); + + +static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { +   /* Shuffle used by CVT_64_FLOAT +    */ +   { +      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, +      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +   }, + +   /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED +    */ +   { +      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, +      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, +   }, +    +   /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED +    */ +   { +      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80, +      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80, +   }, +    +   /* High value shuffle used by trans4x4. 
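+    * ({A0,B0,A1,B1}: interleaves the first two words of the two input
+    * qwords; the low shuffle below interleaves words 2 and 3.)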
+    */ +   { +      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, +      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17 +   }, + +   /* Low value shuffle used by trans4x4. +    */ +   { +      0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, +      0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F +   } +}; + + +/** + * Fetch vertex attributes for 'count' vertices. + */ +static void generic_vertex_fetch(struct spu_vs_context *draw, +                                 struct spu_exec_machine *machine, +                                 const unsigned *elts, +                                 unsigned count) +{ +   unsigned nr_attrs = draw->vertex_fetch.nr_attrs; +   unsigned attr; + +   ASSERT(count <= 4); + +#if DRAW_DBG +   printf("SPU: %s count = %u, nr_attrs = %u\n",  +          __FUNCTION__, count, nr_attrs); +#endif + +   /* loop over vertex attributes (vertex shader inputs) +    */ +   for (attr = 0; attr < nr_attrs; attr++) { +      const unsigned pitch = draw->vertex_fetch.pitch[attr]; +      const uint64_t src = draw->vertex_fetch.src_ptr[attr]; +      const spu_fetch_func fetch = (spu_fetch_func) +	  (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]); +      unsigned i; +      unsigned idx; +      const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; +      const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; +      qword in[2 * 4] ALIGN16_ATTRIB; + + +      /* Fetch four attributes for four vertices.   +       */ +      idx = 0; +      for (i = 0; i < count; i++) { +         const uint64_t addr = src + (elts[i] * pitch); + +#if DRAW_DBG +         printf("SPU: fetching = 0x%llx\n", addr); +#endif + +         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry); +         idx += quads_per_entry; +      } + +      /* Be nice and zero out any missing vertices. +       */ +      (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword)); + + +      /* Convert all 4 vertices to vectors of float. +       */ +      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data); +   } +} + + +void spu_update_vertex_fetch( struct spu_vs_context *draw ) +{ +   draw->vertex_fetch.fetch_func = generic_vertex_fetch; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c new file mode 100644 index 0000000000..fbe5b34d39 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -0,0 +1,244 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  *   Brian Paul +  *   Ian Romanick <idr@us.ibm.com> +  */ + +#include <spu_mfcio.h> + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "draw/draw_private.h" +#include "draw/draw_context.h" +#include "cell/common.h" +#include "spu_vertex_shader.h" +#include "spu_exec.h" +#include "spu_main.h" + + +#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float)) + + +#define CLIP_RIGHT_BIT 0x01 +#define CLIP_LEFT_BIT 0x02 +#define CLIP_TOP_BIT 0x04 +#define CLIP_BOTTOM_BIT 0x08 +#define CLIP_FAR_BIT 0x10 +#define CLIP_NEAR_BIT 0x20 + + +static INLINE float +dot4(const float *a, const float *b) +{ +   return (a[0]*b[0] + +           a[1]*b[1] + +           a[2]*b[2] + +           a[3]*b[3]); +} + +static INLINE unsigned +compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr) +{ +   unsigned mask = 0; +   unsigned i; + +   /* Do the hardwired planes first: +    */ +   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT; +   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT; +   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT; +   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT; +   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT; +   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT; + +   /* Followed by any remaining ones: +    */ +   for (i = 6; i < nr; i++) { +      if (dot4(clip, plane[i]) < 0)  +         mask |= (1<<i); +   } + +   return mask; +} + + +/** + * Transform vertices with the current vertex program/shader + * Up to four vertices can be shaded at a time. 
+ * \param vbuffer  the input vertex data + * \param elts  indexes of four input vertices + * \param count  number of vertices to shade [1..4] + * \param vOut  array of pointers to four output vertices + */ +static void +run_vertex_program(struct spu_vs_context *draw, +                   unsigned elts[4], unsigned count, +                   const uint64_t *vOut) +{ +   struct spu_exec_machine *machine = &draw->machine; +   unsigned int j; + +   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS); +   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS); +   const float *scale = draw->viewport.scale; +   const float *trans = draw->viewport.translate; + +   ASSERT(count <= 4); + +   machine->Processor = TGSI_PROCESSOR_VERTEX; + +   ASSERT_ALIGN16(draw->constants); +   machine->Consts = (float (*)[4]) draw->constants; + +   machine->Inputs = ALIGN16_ASSIGN(inputs); +   machine->Outputs = ALIGN16_ASSIGN(outputs); + +   spu_vertex_fetch( draw, machine, elts, count ); + +   /* run shader */ +   spu_exec_machine_run( machine ); + + +   /* store machine results */ +   for (j = 0; j < count; j++) { +      unsigned slot; +      float x, y, z, w; +      unsigned char buffer[sizeof(struct vertex_header) +          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB; +      struct vertex_header *const tmpOut = +          (struct vertex_header *) buffer; +      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header) +                                           + (sizeof(float) * 4  +                                              * draw->num_vs_outputs)); + +      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); +      wait_on_mask(1 << TAG_VERTEX_BUFFER); + + +      /* Handle attr[0] (position) specially: +       * +       * XXX: Computing the clipmask should be done in the vertex +       * program as a set of DP4 instructions appended to the +       * user-provided code. +       */ +      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j]; +      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j]; +      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j]; +      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j]; + +      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane, +					   draw->nr_planes); +      tmpOut->edgeflag = 1; + +      /* divide by w */ +      w = 1.0f / w; +      x *= w; +      y *= w; +      z *= w; + +      /* Viewport mapping */ +      tmpOut->data[0][0] = x * scale[0] + trans[0]; +      tmpOut->data[0][1] = y * scale[1] + trans[1]; +      tmpOut->data[0][2] = z * scale[2] + trans[2]; +      tmpOut->data[0][3] = w; + +      /* Remaining attributes are packed into sequential post-transform +       * vertex attrib slots. 
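+       * (data[slot] for slot >= 1 simply mirrors machine->Outputs[slot]
+       * for this vertex, one xyzw vector per output.)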
+       */ +      for (slot = 1; slot < draw->num_vs_outputs; slot++) { +         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; +         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; +         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; +         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; +      } + +      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); +   } /* loop over vertices */ +} + + +unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] +    ALIGN16_ATTRIB; + + +void +spu_bind_vertex_shader(struct spu_vs_context *draw, +		       struct cell_shader_info *vs) +{ +   const unsigned immediate_addr = vs->immediates; +   const unsigned immediate_size =  +       ROUNDUP16((sizeof(float) * 4 * vs->num_immediates) +		 + (immediate_addr & 0x0f)); +  + +   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, +           TAG_VERTEX_BUFFER, 0, 0); + +   draw->machine.Instructions = (struct tgsi_full_instruction *) +       vs->instructions; +   draw->machine.NumInstructions = vs->num_instructions; + +   draw->machine.Declarations = (struct tgsi_full_declaration *) +       vs->declarations; +   draw->machine.NumDeclarations = vs->num_declarations; + +   draw->num_vs_outputs = vs->num_outputs; + +   /* specify the shader to interpret/execute */ +   spu_exec_machine_init(&draw->machine, +			 PIPE_MAX_SAMPLERS, +			 NULL /*samplers*/, +			 PIPE_SHADER_VERTEX); + +   wait_on_mask(1 << TAG_VERTEX_BUFFER); + +   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], +                 sizeof(float) * 4 * vs->num_immediates); +} + + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, +                          const struct cell_command_vs *vs) +{ +   unsigned i; + +   (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes); +   draw->nr_planes = vs->nr_planes; +   draw->vertex_fetch.nr_attrs = vs->nr_attrs; + +   for (i = 0; i < vs->num_elts; i += 4) { +      const unsigned batch_size = MIN2(vs->num_elts - i, 4); + +      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]); +   } +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h new file mode 100644 index 0000000000..4c74f5e74d --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -0,0 +1,66 @@ +#ifndef SPU_VERTEX_SHADER_H +#define SPU_VERTEX_SHADER_H + +#include "cell/common.h" +#include "pipe/p_format.h" +#include "spu_exec.h" + +struct spu_vs_context; + +typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, +				     struct spu_exec_machine *machine, +				     const unsigned *elts, +				     unsigned count ); + +struct spu_vs_context { +   struct pipe_viewport_state viewport; + +   struct { +      uint64_t src_ptr[PIPE_MAX_ATTRIBS]; +      unsigned pitch[PIPE_MAX_ATTRIBS]; +      unsigned size[PIPE_MAX_ATTRIBS]; +      unsigned code_offset[PIPE_MAX_ATTRIBS]; +      unsigned nr_attrs; +      boolean dirty; + +      spu_full_fetch_func fetch_func; +      void *code; +   } vertex_fetch; +    +   /* Clip derived state: +    */ +   float plane[12][4]; +   unsigned nr_planes; + +   struct spu_exec_machine machine; +   const float (*constants)[4]; + +   unsigned num_vs_outputs; +}; + +extern void spu_update_vertex_fetch(struct spu_vs_context *draw); + +static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, +				    struct spu_exec_machine *machine, +				    const unsigned *elts, +				    
unsigned count) +{ +   if (draw->vertex_fetch.dirty) { +      spu_update_vertex_fetch(draw); +      draw->vertex_fetch.dirty = 0; +   } +    +   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count); +} + +struct cell_command_vs; + +extern void +spu_bind_vertex_shader(struct spu_vs_context *draw, +		       struct cell_shader_info *vs); + +extern void +spu_execute_vertex_shader(struct spu_vs_context *draw, +			  const struct cell_command_vs *vs); + +#endif /* SPU_VERTEX_SHADER_H */ diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile new file mode 100644 index 0000000000..f08b8df07a --- /dev/null +++ b/src/gallium/drivers/failover/Makefile @@ -0,0 +1,14 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = failover + +C_SOURCES = \ +	fo_state.c \ +	fo_state_emit.c \ +	fo_context.c  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/failover/SConscript b/src/gallium/drivers/failover/SConscript new file mode 100644 index 0000000000..f8e9b1b491 --- /dev/null +++ b/src/gallium/drivers/failover/SConscript @@ -0,0 +1,13 @@ +Import('*') + +env = env.Clone() + +failover = env.ConvenienceLibrary( +	target = 'failover', +	source = [ +		'fo_state.c', +		'fo_state_emit.c', +		'fo_context.c', +	]) + +Export('failover') diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c new file mode 100644 index 0000000000..0742b27b8f --- /dev/null +++ b/src/gallium/drivers/failover/fo_context.c @@ -0,0 +1,159 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_memory.h" +#include "pipe/p_context.h" + +#include "fo_context.h" +#include "fo_winsys.h" + + + +static void failover_destroy( struct pipe_context *pipe ) +{ +   struct failover_context *failover = failover_context( pipe ); + +   free( failover ); +} + + + +static boolean failover_draw_elements( struct pipe_context *pipe, +				       struct pipe_buffer *indexBuffer, +				       unsigned indexSize, +				       unsigned prim, unsigned start, unsigned count) +{ +   struct failover_context *failover = failover_context( pipe ); + +   /* If there has been any statechange since last time, try hardware +    * rendering again: +    */ +   if (failover->dirty) { +      failover->mode = FO_HW; +   } + +   /* Try hardware: +    */ +   if (failover->mode == FO_HW) { +      if (!failover->hw->draw_elements( failover->hw,  +					indexBuffer,  +					indexSize,  +					prim,  +					start,  +					count )) { + +	 failover->hw->flush( failover->hw, ~0, NULL ); +	 failover->mode = FO_SW; +      } +   } + +   /* Possibly try software: +    */ +   if (failover->mode == FO_SW) { + +      if (failover->dirty)  +	 failover_state_emit( failover ); + +      failover->sw->draw_elements( failover->sw,  +				   indexBuffer,  +				   indexSize,  +				   prim,  +				   start,  +				   count ); + +      /* Be ready to switch back to hardware rendering without an +       * intervening flush.  Unlikely to be much performance impact to +       * this: +       */ +      failover->sw->flush( failover->sw, ~0, NULL ); +   } + +   return TRUE; +} + + +static boolean failover_draw_arrays( struct pipe_context *pipe, +				     unsigned prim, unsigned start, unsigned count) +{ +   return failover_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + +struct pipe_context *failover_create( struct pipe_context *hw, +				      struct pipe_context *sw ) +{ +   struct failover_context *failover = CALLOC_STRUCT(failover_context); +   if (failover == NULL) +      return NULL; + +   failover->hw = hw; +   failover->sw = sw; +   failover->pipe.winsys = hw->winsys; +   failover->pipe.screen = hw->screen; +   failover->pipe.destroy = failover_destroy; +#if 0 +   failover->pipe.is_format_supported = hw->is_format_supported; +   failover->pipe.get_name = hw->get_name; +   failover->pipe.get_vendor = hw->get_vendor; +   failover->pipe.get_param = hw->get_param; +   failover->pipe.get_paramf = hw->get_paramf; +#endif + +   failover->pipe.draw_arrays = failover_draw_arrays; +   failover->pipe.draw_elements = failover_draw_elements; +   failover->pipe.clear = hw->clear; + +   /* No software occlusion fallback (or other optional functionality) +    * at this point - if the hardware doesn't support it, don't +    * advertise it to the application. 
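+    * (Queries, surface copy/fill and flush are therefore wired straight
+    * to the hardware context below.)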
+    */ +   failover->pipe.begin_query = hw->begin_query; +   failover->pipe.end_query = hw->end_query; + +   failover_init_state_functions( failover ); + +   failover->pipe.surface_copy = hw->surface_copy; +   failover->pipe.surface_fill = hw->surface_fill; + +#if 0 +   failover->pipe.texture_create = hw->texture_create; +   failover->pipe.texture_release = hw->texture_release; +   failover->pipe.get_tex_surface = hw->get_tex_surface; +   failover->pipe.texture_update = hw->texture_update; +#endif + +   failover->pipe.flush = hw->flush; + +   failover->dirty = 0; + +   return &failover->pipe; +} + diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h new file mode 100644 index 0000000000..9ba86ba866 --- /dev/null +++ b/src/gallium/drivers/failover/fo_context.h @@ -0,0 +1,125 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef FO_CONTEXT_H +#define FO_CONTEXT_H + +#include "pipe/p_state.h" +#include "pipe/p_context.h" + + + +#define FO_NEW_VIEWPORT        0x1 +#define FO_NEW_RASTERIZER      0x2 +#define FO_NEW_FRAGMENT_SHADER 0x4 +#define FO_NEW_BLEND           0x8 +#define FO_NEW_CLIP            0x10 +#define FO_NEW_SCISSOR         0x20 +#define FO_NEW_STIPPLE         0x40 +#define FO_NEW_FRAMEBUFFER     0x80 +#define FO_NEW_ALPHA_TEST      0x100 +#define FO_NEW_DEPTH_STENCIL   0x200 +#define FO_NEW_SAMPLER         0x400 +#define FO_NEW_TEXTURE         0x800 +#define FO_NEW_VERTEX          0x2000 +#define FO_NEW_VERTEX_SHADER   0x4000 +#define FO_NEW_BLEND_COLOR     0x8000 +#define FO_NEW_CLEAR_COLOR     0x10000 +#define FO_NEW_VERTEX_BUFFER   0x20000 +#define FO_NEW_VERTEX_ELEMENT  0x40000 + + + +#define FO_HW 0 +#define FO_SW 1 + +struct fo_state { +   void *sw_state; +   void *hw_state; +}; +struct failover_context { +   struct pipe_context pipe;  /**< base class */ + + +   /* The most recent drawing state as set by the driver: +    */ +   const struct fo_state     *blend; +   const struct fo_state     *sampler[PIPE_MAX_SAMPLERS]; +   const struct fo_state     *depth_stencil; +   const struct fo_state     *rasterizer; +   const struct fo_state     *fragment_shader; +   const struct fo_state     *vertex_shader; + +   struct pipe_blend_color blend_color; +   struct pipe_clip_state clip; +   struct pipe_framebuffer_state framebuffer; +   struct pipe_poly_stipple poly_stipple; +   struct pipe_scissor_state scissor; +   struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; +   struct pipe_viewport_state viewport; +   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; +   struct pipe_vertex_element vertex_elements[PIPE_MAX_ATTRIBS]; + +   uint num_vertex_buffers; +   uint num_vertex_elements; + +   void *sw_sampler_state[PIPE_MAX_SAMPLERS]; +   void *hw_sampler_state[PIPE_MAX_SAMPLERS]; + +   unsigned dirty; + +   unsigned num_samplers; +   unsigned num_textures; + +   unsigned mode; +   struct pipe_context *hw; +   struct pipe_context *sw; +}; + + + +void failover_init_state_functions( struct failover_context *failover ); +void failover_state_emit( struct failover_context *failover ); + +static INLINE struct failover_context * +failover_context( struct pipe_context *pipe ) +{ +   return (struct failover_context *)pipe; +} + +/* Internal functions + */ +void +failover_set_constant_buffer(struct pipe_context *pipe, +                             uint shader, uint index, +                             const struct pipe_constant_buffer *buf); + + +#endif /* FO_CONTEXT_H */ diff --git a/src/gallium/drivers/failover/fo_state.c b/src/gallium/drivers/failover/fo_state.c new file mode 100644 index 0000000000..6a79706632 --- /dev/null +++ b/src/gallium/drivers/failover/fo_state.c @@ -0,0 +1,483 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "pipe/p_inlines.h" + +#include "fo_context.h" + + +/* This looks like a lot of work at the moment - we're keeping a + * duplicate copy of the state up-to-date.   + * + * This can change in two ways: + * - With constant state objects we would only need to save a pointer, + *     not the whole object. + * - By adding a callback in the state tracker to re-emit state.  The + *     state tracker knows the current state already and can re-emit it  + *     without additional complexity. + * + * This works as a proof-of-concept, but a final version will have + * lower overheads. 
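+ *
+ * The per-CSO pattern used throughout this file is:
+ *
+ *    create_X:  build a { sw_state, hw_state } pair wrapped in fo_state
+ *    bind_X:    save the wrapper, set an FO_NEW_* dirty bit, bind both
+ *    delete_X:  delete both halves, then free the wrapper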
+ */ + + + +static void * +failover_create_blend_state( struct pipe_context *pipe, +                             const struct pipe_blend_state *blend ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_blend_state(failover->sw, blend); +   state->hw_state = failover->hw->create_blend_state(failover->hw, blend); + +   return state; +} + +static void +failover_bind_blend_state( struct pipe_context *pipe, +                           void *blend ) +{ +   struct failover_context *failover = failover_context(pipe); +   struct fo_state *state = (struct fo_state *)blend; +   failover->blend = state; +   failover->dirty |= FO_NEW_BLEND; +   failover->sw->bind_blend_state( failover->sw, state->sw_state ); +   failover->hw->bind_blend_state( failover->hw, state->hw_state ); +} + +static void +failover_delete_blend_state( struct pipe_context *pipe, +                             void *blend ) +{ +   struct fo_state *state = (struct fo_state*)blend; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_blend_state(failover->sw, state->sw_state); +   failover->hw->delete_blend_state(failover->hw, state->hw_state); +   state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + +static void +failover_set_blend_color( struct pipe_context *pipe, +			  const struct pipe_blend_color *blend_color ) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->blend_color = *blend_color; +   failover->dirty |= FO_NEW_BLEND_COLOR; +   failover->sw->set_blend_color( failover->sw, blend_color ); +   failover->hw->set_blend_color( failover->hw, blend_color ); +} + +static void  +failover_set_clip_state( struct pipe_context *pipe, +			 const struct pipe_clip_state *clip ) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->clip = *clip; +   failover->dirty |= FO_NEW_CLIP; +   failover->sw->set_clip_state( failover->sw, clip ); +   failover->hw->set_clip_state( failover->hw, clip ); +} + + +static void * +failover_create_depth_stencil_state(struct pipe_context *pipe, +                              const struct pipe_depth_stencil_alpha_state *templ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ); +   state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ); + +   return state; +} + +static void +failover_bind_depth_stencil_state(struct pipe_context *pipe, +                                  void *depth_stencil) +{ +   struct failover_context *failover = failover_context(pipe); +   struct fo_state *state = (struct fo_state *)depth_stencil; +   failover->depth_stencil = state; +   failover->dirty |= FO_NEW_DEPTH_STENCIL; +   failover->sw->bind_depth_stencil_alpha_state(failover->sw, state->sw_state); +   failover->hw->bind_depth_stencil_alpha_state(failover->hw, state->hw_state); +} + +static void +failover_delete_depth_stencil_state(struct pipe_context *pipe, +                                    void *ds) +{ +   struct fo_state *state = (struct fo_state*)ds; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_depth_stencil_alpha_state(failover->sw, state->sw_state); +   failover->hw->delete_depth_stencil_alpha_state(failover->hw, state->hw_state); +   
state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + +static void +failover_set_framebuffer_state(struct pipe_context *pipe, +			       const struct pipe_framebuffer_state *framebuffer) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->framebuffer = *framebuffer; +   failover->dirty |= FO_NEW_FRAMEBUFFER; +   failover->sw->set_framebuffer_state( failover->sw, framebuffer ); +   failover->hw->set_framebuffer_state( failover->hw, framebuffer ); +} + + +static void * +failover_create_fs_state(struct pipe_context *pipe, +                         const struct pipe_shader_state *templ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_fs_state(failover->sw, templ); +   state->hw_state = failover->hw->create_fs_state(failover->hw, templ); + +   return state; +} + +static void +failover_bind_fs_state(struct pipe_context *pipe, void *fs) +{ +   struct failover_context *failover = failover_context(pipe); +   struct fo_state *state = (struct fo_state*)fs; +   failover->fragment_shader = state; +   failover->dirty |= FO_NEW_FRAGMENT_SHADER; +   failover->sw->bind_fs_state(failover->sw, state->sw_state); +   failover->hw->bind_fs_state(failover->hw, state->hw_state); +} + +static void +failover_delete_fs_state(struct pipe_context *pipe, +                         void *fs) +{ +   struct fo_state *state = (struct fo_state*)fs; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_fs_state(failover->sw, state->sw_state); +   failover->hw->delete_fs_state(failover->hw, state->hw_state); +   state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + +static void * +failover_create_vs_state(struct pipe_context *pipe, +                         const struct pipe_shader_state *templ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_vs_state(failover->sw, templ); +   state->hw_state = failover->hw->create_vs_state(failover->hw, templ); + +   return state; +} + +static void +failover_bind_vs_state(struct pipe_context *pipe, +                       void *vs) +{ +   struct failover_context *failover = failover_context(pipe); + +   struct fo_state *state = (struct fo_state*)vs; +   failover->vertex_shader = state; +   failover->dirty |= FO_NEW_VERTEX_SHADER; +   failover->sw->bind_vs_state(failover->sw, state->sw_state); +   failover->hw->bind_vs_state(failover->hw, state->hw_state); +} + +static void +failover_delete_vs_state(struct pipe_context *pipe, +                         void *vs) +{ +   struct fo_state *state = (struct fo_state*)vs; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_vs_state(failover->sw, state->sw_state); +   failover->hw->delete_vs_state(failover->hw, state->hw_state); +   state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + +static void  +failover_set_polygon_stipple( struct pipe_context *pipe, +			      const struct pipe_poly_stipple *stipple ) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->poly_stipple = *stipple; +   failover->dirty |= FO_NEW_STIPPLE; +   failover->sw->set_polygon_stipple( failover->sw, stipple ); +   failover->hw->set_polygon_stipple( failover->hw, stipple ); +} + + +static void * +failover_create_rasterizer_state(struct pipe_context *pipe, 
+                                 const struct pipe_rasterizer_state *templ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ); +   state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ); + +   return state; +} + +static void +failover_bind_rasterizer_state(struct pipe_context *pipe, +                               void *raster) +{ +   struct failover_context *failover = failover_context(pipe); + +   struct fo_state *state = (struct fo_state*)raster; +   failover->rasterizer = state; +   failover->dirty |= FO_NEW_RASTERIZER; +   failover->sw->bind_rasterizer_state(failover->sw, state->sw_state); +   failover->hw->bind_rasterizer_state(failover->hw, state->hw_state); +} + +static void +failover_delete_rasterizer_state(struct pipe_context *pipe, +                                 void *raster) +{ +   struct fo_state *state = (struct fo_state*)raster; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_rasterizer_state(failover->sw, state->sw_state); +   failover->hw->delete_rasterizer_state(failover->hw, state->hw_state); +   state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + + +static void  +failover_set_scissor_state( struct pipe_context *pipe, +                                 const struct pipe_scissor_state *scissor ) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->scissor = *scissor; +   failover->dirty |= FO_NEW_SCISSOR; +   failover->sw->set_scissor_state( failover->sw, scissor ); +   failover->hw->set_scissor_state( failover->hw, scissor ); +} + + +static void * +failover_create_sampler_state(struct pipe_context *pipe, +                              const struct pipe_sampler_state *templ) +{ +   struct fo_state *state = malloc(sizeof(struct fo_state)); +   struct failover_context *failover = failover_context(pipe); + +   state->sw_state = failover->sw->create_sampler_state(failover->sw, templ); +   state->hw_state = failover->hw->create_sampler_state(failover->hw, templ); + +   return state; +} + +static void +failover_bind_sampler_states(struct pipe_context *pipe, +                             unsigned num, void **sampler) +{ +   struct failover_context *failover = failover_context(pipe); +   struct fo_state *state = (struct fo_state*)sampler; +   uint i; +   assert(num <= PIPE_MAX_SAMPLERS); +   /* Check for no-op */ +   if (num == failover->num_samplers && +       !memcmp(failover->sampler, sampler, num * sizeof(void *))) +      return; +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { +      failover->sw_sampler_state[i] = i < num ? state[i].sw_state : NULL; +      failover->hw_sampler_state[i] = i < num ? 
state[i].hw_state : NULL; +   } +   failover->dirty |= FO_NEW_SAMPLER; +   failover->num_samplers = num; +   failover->sw->bind_sampler_states(failover->sw, num, +                                     failover->sw_sampler_state); +   failover->hw->bind_sampler_states(failover->hw, num, +                                     failover->hw_sampler_state); +} + +static void +failover_delete_sampler_state(struct pipe_context *pipe, void *sampler) +{ +   struct fo_state *state = (struct fo_state*)sampler; +   struct failover_context *failover = failover_context(pipe); + +   failover->sw->delete_sampler_state(failover->sw, state->sw_state); +   failover->hw->delete_sampler_state(failover->hw, state->hw_state); +   state->sw_state = 0; +   state->hw_state = 0; +   free(state); +} + + +static void +failover_set_sampler_textures(struct pipe_context *pipe, +                              unsigned num, +                              struct pipe_texture **texture) +{ +   struct failover_context *failover = failover_context(pipe); +   uint i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == failover->num_textures && +       !memcmp(failover->texture, texture, num * sizeof(struct pipe_texture *))) +      return; +   for (i = 0; i < num; i++) +      pipe_texture_reference((struct pipe_texture **) &failover->texture[i], +                             texture[i]); +   for (i = num; i < failover->num_textures; i++) +      pipe_texture_reference((struct pipe_texture **) &failover->texture[i], +                             NULL); +   failover->dirty |= FO_NEW_TEXTURE; +   failover->num_textures = num; +   failover->sw->set_sampler_textures( failover->sw, num, texture ); +   failover->hw->set_sampler_textures( failover->hw, num, texture ); +} + + +static void  +failover_set_viewport_state( struct pipe_context *pipe, +			     const struct pipe_viewport_state *viewport ) +{ +   struct failover_context *failover = failover_context(pipe); + +   failover->viewport = *viewport;  +   failover->dirty |= FO_NEW_VIEWPORT; +   failover->sw->set_viewport_state( failover->sw, viewport ); +   failover->hw->set_viewport_state( failover->hw, viewport ); +} + + +static void +failover_set_vertex_buffers(struct pipe_context *pipe, +                            unsigned count, +                            const struct pipe_vertex_buffer *vertex_buffers) +{ +   struct failover_context *failover = failover_context(pipe); + +   memcpy(failover->vertex_buffers, vertex_buffers, +          count * sizeof(vertex_buffers[0])); +   failover->dirty |= FO_NEW_VERTEX_BUFFER; +   failover->num_vertex_buffers = count; +   failover->sw->set_vertex_buffers( failover->sw, count, vertex_buffers ); +   failover->hw->set_vertex_buffers( failover->hw, count, vertex_buffers ); +} + + +static void +failover_set_vertex_elements(struct pipe_context *pipe, +                             unsigned count, +                             const struct pipe_vertex_element *vertex_elements) +{ +   struct failover_context *failover = failover_context(pipe); + +   memcpy(failover->vertex_elements, vertex_elements, +          count * sizeof(vertex_elements[0])); + +   failover->dirty |= FO_NEW_VERTEX_ELEMENT; +   failover->num_vertex_elements = count; +   failover->sw->set_vertex_elements( failover->sw, count, vertex_elements ); +   failover->hw->set_vertex_elements( failover->hw, count, vertex_elements ); +} + +void +failover_set_constant_buffer(struct pipe_context *pipe, +                             uint shader, uint index, +             
                const struct pipe_constant_buffer *buf) +{ +   struct failover_context *failover = failover_context(pipe); + +   assert(shader < PIPE_SHADER_TYPES); +   assert(index == 0); + +   failover->sw->set_constant_buffer(failover->sw, shader, index, buf); +   failover->hw->set_constant_buffer(failover->hw, shader, index, buf); +} + + +void +failover_init_state_functions( struct failover_context *failover ) +{ +   failover->pipe.create_blend_state = failover_create_blend_state; +   failover->pipe.bind_blend_state   = failover_bind_blend_state; +   failover->pipe.delete_blend_state = failover_delete_blend_state; +   failover->pipe.create_sampler_state = failover_create_sampler_state; +   failover->pipe.bind_sampler_states  = failover_bind_sampler_states; +   failover->pipe.delete_sampler_state = failover_delete_sampler_state; +   failover->pipe.create_depth_stencil_alpha_state = failover_create_depth_stencil_state; +   failover->pipe.bind_depth_stencil_alpha_state   = failover_bind_depth_stencil_state; +   failover->pipe.delete_depth_stencil_alpha_state = failover_delete_depth_stencil_state; +   failover->pipe.create_rasterizer_state = failover_create_rasterizer_state; +   failover->pipe.bind_rasterizer_state = failover_bind_rasterizer_state; +   failover->pipe.delete_rasterizer_state = failover_delete_rasterizer_state; +   failover->pipe.create_fs_state = failover_create_fs_state; +   failover->pipe.bind_fs_state   = failover_bind_fs_state; +   failover->pipe.delete_fs_state = failover_delete_fs_state; +   failover->pipe.create_vs_state = failover_create_vs_state; +   failover->pipe.bind_vs_state   = failover_bind_vs_state; +   failover->pipe.delete_vs_state = failover_delete_vs_state; + +   failover->pipe.set_blend_color = failover_set_blend_color; +   failover->pipe.set_clip_state = failover_set_clip_state; +   failover->pipe.set_framebuffer_state = failover_set_framebuffer_state; +   failover->pipe.set_polygon_stipple = failover_set_polygon_stipple; +   failover->pipe.set_scissor_state = failover_set_scissor_state; +   failover->pipe.set_sampler_textures = failover_set_sampler_textures; +   failover->pipe.set_viewport_state = failover_set_viewport_state; +   failover->pipe.set_vertex_buffers = failover_set_vertex_buffers; +   failover->pipe.set_vertex_elements = failover_set_vertex_elements; +   failover->pipe.set_constant_buffer = failover_set_constant_buffer; +} diff --git a/src/gallium/drivers/failover/fo_state_emit.c b/src/gallium/drivers/failover/fo_state_emit.c new file mode 100644 index 0000000000..bd4fce9d20 --- /dev/null +++ b/src/gallium/drivers/failover/fo_state_emit.c @@ -0,0 +1,117 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "fo_context.h" + +/* This looks like a lot of work at the moment - we're keeping a + * duplicate copy of the state up-to-date.   + * + * This can change in two ways: + * - With constant state objects we would only need to save a pointer, + *     not the whole object. + * - By adding a callback in the state tracker to re-emit state.  The + *     state tracker knows the current state already and can re-emit it  + *     without additional complexity. + * + * This works as a proof-of-concept, but a final version will have + * lower overheads. + */ + + +/* Bring the software pipe uptodate with current state. + *  + * With constant state objects we would probably just send all state + * to both rasterizers all the time??? + */ +void +failover_state_emit( struct failover_context *failover ) +{ +   if (failover->dirty & FO_NEW_BLEND) +      failover->sw->bind_blend_state( failover->sw, +                                      failover->blend->sw_state ); + +   if (failover->dirty & FO_NEW_BLEND_COLOR) +      failover->sw->set_blend_color( failover->sw, &failover->blend_color ); + +   if (failover->dirty & FO_NEW_CLIP) +      failover->sw->set_clip_state( failover->sw, &failover->clip ); + +   if (failover->dirty & FO_NEW_DEPTH_STENCIL) +      failover->sw->bind_depth_stencil_alpha_state( failover->sw, +						    failover->depth_stencil->sw_state ); + +   if (failover->dirty & FO_NEW_FRAMEBUFFER) +      failover->sw->set_framebuffer_state( failover->sw, &failover->framebuffer ); + +   if (failover->dirty & FO_NEW_FRAGMENT_SHADER) +      failover->sw->bind_fs_state( failover->sw, +                                   failover->fragment_shader->sw_state ); + +   if (failover->dirty & FO_NEW_VERTEX_SHADER) +      failover->sw->bind_vs_state( failover->sw, +                                   failover->vertex_shader->sw_state ); + +   if (failover->dirty & FO_NEW_STIPPLE) +      failover->sw->set_polygon_stipple( failover->sw, &failover->poly_stipple ); + +   if (failover->dirty & FO_NEW_RASTERIZER) +      failover->sw->bind_rasterizer_state( failover->sw, +                                           failover->rasterizer->sw_state ); + +   if (failover->dirty & FO_NEW_SCISSOR) +      failover->sw->set_scissor_state( failover->sw, &failover->scissor ); + +   if (failover->dirty & FO_NEW_VIEWPORT) +      failover->sw->set_viewport_state( failover->sw, &failover->viewport ); + +   if (failover->dirty & FO_NEW_SAMPLER) { +      failover->sw->bind_sampler_states( failover->sw, failover->num_samplers, +                                         failover->sw_sampler_state ); +   } + +   if (failover->dirty & FO_NEW_TEXTURE) { +      failover->sw->set_sampler_textures( failover->sw, failover->num_textures,  +                                          failover->texture ); +   } + +   if (failover->dirty & FO_NEW_VERTEX_BUFFER) { +      failover->sw->set_vertex_buffers( 
failover->sw, +                                        failover->num_vertex_buffers, +                                        failover->vertex_buffers ); +   } + +   if (failover->dirty & FO_NEW_VERTEX_ELEMENT) { +      failover->sw->set_vertex_elements( failover->sw, +                                         failover->num_vertex_elements, +                                         failover->vertex_elements ); +   } + +   failover->dirty = 0; +} diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h new file mode 100644 index 0000000000..a8ce997a1f --- /dev/null +++ b/src/gallium/drivers/failover/fo_winsys.h @@ -0,0 +1,45 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef FO_WINSYS_H +#define FO_WINSYS_H + + +/* This is the interface that failover requires any window system + * hosting it to implement.  This is the only include file in failover + * which is public. + */ + + +struct pipe_context; + + +struct pipe_context *failover_create( struct pipe_context *hw, +				      struct pipe_context *sw ); + + +#endif /* FO_WINSYS_H */ diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile new file mode 100644 index 0000000000..41a61a0020 --- /dev/null +++ b/src/gallium/drivers/i915simple/Makefile @@ -0,0 +1,31 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current
+
+LIBNAME = i915simple
+
+C_SOURCES = \
+	i915_blit.c \
+	i915_clear.c \
+	i915_flush.c \
+	i915_context.c \
+	i915_debug.c \
+	i915_debug_fp.c \
+	i915_state.c \
+	i915_state_immediate.c \
+	i915_state_dynamic.c \
+	i915_state_derived.c \
+	i915_state_emit.c \
+	i915_state_sampler.c \
+	i915_screen.c \
+	i915_prim_emit.c \
+	i915_prim_vbuf.c \
+	i915_texture.c \
+	i915_fpc_emit.c \
+	i915_fpc_translate.c \
+	i915_surface.c 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/i915simple/SConscript b/src/gallium/drivers/i915simple/SConscript
new file mode 100644
index 0000000000..2366e1247f
--- /dev/null
+++ b/src/gallium/drivers/i915simple/SConscript
@@ -0,0 +1,29 @@
+Import('*')
+
+env = env.Clone()
+
+i915simple = env.ConvenienceLibrary(
+	target = 'i915simple',
+	source = [
+		'i915_blit.c',
+		'i915_clear.c',
+		'i915_context.c',
+		'i915_debug.c',
+		'i915_debug_fp.c',
+		'i915_flush.c',
+		'i915_fpc_emit.c',
+		'i915_fpc_translate.c',
+		'i915_prim_emit.c',
+		'i915_prim_vbuf.c',
+		'i915_screen.c',
+		'i915_state.c',
+		'i915_state_derived.c',
+		'i915_state_dynamic.c',
+		'i915_state_emit.c',
+		'i915_state_immediate.c',
+		'i915_state_sampler.c',
+		'i915_surface.c',
+		'i915_texture.c',
+	])
+
+Export('i915simple')
diff --git a/src/gallium/drivers/i915simple/i915_batch.h b/src/gallium/drivers/i915simple/i915_batch.h
new file mode 100644
index 0000000000..a433cf054d
--- /dev/null
+++ b/src/gallium/drivers/i915simple/i915_batch.h
@@ -0,0 +1,116 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_BATCH_H
+#define I915_BATCH_H
+
+#include "i915_winsys.h"
+
+struct i915_batchbuffer
+{
+   struct pipe_buffer *buffer;
+   struct i915_winsys *winsys;
+
+   unsigned char *map;
+   unsigned char *ptr;
+
+   size_t size;
+   size_t actual_size;
+
+   size_t relocs;
+   size_t max_relocs;
+};
+
+static INLINE boolean
+i915_batchbuffer_check( struct i915_batchbuffer *batch,
+			size_t dwords,
+			size_t relocs )
+{
+   /** TODO JB: Check relocs */
+   return dwords * 4 <= batch->size - (batch->ptr - batch->map);
+}
+
+static INLINE size_t
+i915_batchbuffer_space( struct i915_batchbuffer *batch )
+{
+   return batch->size - (batch->ptr - batch->map);
+}
+
+static INLINE void
+i915_batchbuffer_dword( struct i915_batchbuffer *batch,
+			unsigned dword )
+{
+   if (i915_batchbuffer_space(batch) < 4)
+      return;
+
+   *(unsigned *)batch->ptr = dword;
+   batch->ptr += 4;
+}
+
+static INLINE void
+i915_batchbuffer_write( struct i915_batchbuffer *batch,
+			void *data,
+			size_t size )
+{
+   if (i915_batchbuffer_space(batch) < size)
+      return;
+
+   /* append the caller's data to the batch */
+   memcpy(batch->ptr, data, size);
+   batch->ptr += size;
+}
+
+static INLINE void
+i915_batchbuffer_reloc( struct i915_batchbuffer *batch,
+			struct pipe_buffer *buffer,
+			size_t flags,
+			size_t offset )
+{
+   batch->winsys->batch_reloc( batch->winsys, buffer, flags, offset );
+}
+
+static INLINE void
+i915_batchbuffer_flush( struct i915_batchbuffer *batch,
+			struct pipe_fence_handle **fence )
+{
+   batch->winsys->batch_flush( batch->winsys, fence );
+}
+
+#define BEGIN_BATCH( dwords, relocs ) \
+   (i915_batchbuffer_check( i915->batch, dwords, relocs ))
+
+#define OUT_BATCH( dword ) \
+   i915_batchbuffer_dword( i915->batch, dword )
+
+#define OUT_RELOC( buf, flags, delta ) \
+   i915_batchbuffer_reloc( i915->batch, buf, flags, delta )
+
+#define FLUSH_BATCH(fence) do {				\
+   i915->winsys->batch_flush( i915->winsys, fence );	\
+   i915->hardware_dirty = ~0;				\
+} while (0)
+
+#endif
diff --git a/src/gallium/drivers/i915simple/i915_blit.c b/src/gallium/drivers/i915simple/i915_blit.c
new file mode 100644
index 0000000000..448a4708ce
--- /dev/null
+++ b/src/gallium/drivers/i915simple/i915_blit.c
@@ -0,0 +1,159 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_blit.h" +#include "i915_reg.h" +#include "i915_batch.h" +#include "i915_debug.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +void +i915_fill_blit(struct i915_context *i915, +	       unsigned cpp, +	       unsigned short dst_pitch, +	       struct pipe_buffer *dst_buffer, +	       unsigned dst_offset, +	       short x, short y,  +	       short w, short h,  +	       unsigned color) +{ +   unsigned BR13, CMD; + + +   I915_DBG(i915, +       "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", +       __FUNCTION__, +       dst_buffer, dst_pitch, dst_offset, x, y, w, h); + +   switch (cpp) { +   case 1: +   case 2: +   case 3: +      BR13 = (((int) dst_pitch) & 0xffff) | +	 (0xF0 << 16) | (1 << 24); +      CMD = XY_COLOR_BLT_CMD; +      break; +   case 4: +      BR13 = (((int) dst_pitch) & 0xffff) | +	 (0xF0 << 16) | (1 << 24) | (1 << 25); +      CMD = (XY_COLOR_BLT_CMD | XY_COLOR_BLT_WRITE_ALPHA | +             XY_COLOR_BLT_WRITE_RGB); +      break; +   default: +      return; +   } + +   if (!BEGIN_BATCH(6, 1)) { +      FLUSH_BATCH(NULL); +      assert(BEGIN_BATCH(6, 1)); +   } +   OUT_BATCH(CMD); +   OUT_BATCH(BR13); +   OUT_BATCH((y << 16) | x); +   OUT_BATCH(((y + h) << 16) | (x + w)); +   OUT_RELOC( dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset); +   OUT_BATCH(color); +   FLUSH_BATCH(NULL); +} + + +void +i915_copy_blit( struct i915_context *i915, +                  unsigned do_flip, +                  unsigned cpp, +                  unsigned short src_pitch, +                  struct pipe_buffer *src_buffer, +                  unsigned src_offset, +                  unsigned short dst_pitch, +                  struct pipe_buffer *dst_buffer, +                  unsigned dst_offset, +                  short src_x, short src_y, +                  short dst_x, short dst_y,  +		  short w, short h ) +{ +   unsigned CMD, BR13; +   int dst_y2 = dst_y + h; +   int dst_x2 = dst_x + w; + + +   I915_DBG(i915, +       "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", +       __FUNCTION__, +       src_buffer, src_pitch, src_offset, src_x, src_y, +       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); + +   switch (cpp) { +   case 1: +   case 2: +   case 3: +      BR13 = (((int) dst_pitch) & 0xffff) | +	 (0xCC << 16) | (1 << 24); +      CMD = XY_SRC_COPY_BLT_CMD; +      break; +   case 4: +      BR13 = (((int) dst_pitch) & 0xffff) | +	 (0xCC << 16) | (1 << 24) | (1 << 25); +      CMD = +         (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA | +          XY_SRC_COPY_BLT_WRITE_RGB); +      break; +   default: +      return; +   } + +   if (dst_y2 < dst_y ||  +       dst_x2 < dst_x) { +      return; +   } + +   /* Hardware can handle negative pitches but loses the ability to do +    * proper overlapping blits in that case.  We don't really have a +    * need for either at this stage. 
+    */ +   assert (dst_pitch > 0 && src_pitch > 0); + + +   if (!BEGIN_BATCH(8, 2)) { +      FLUSH_BATCH(NULL); +      assert(BEGIN_BATCH(8, 2)); +   } +   OUT_BATCH(CMD); +   OUT_BATCH(BR13); +   OUT_BATCH((dst_y << 16) | dst_x); +   OUT_BATCH((dst_y2 << 16) | dst_x2); +   OUT_RELOC(dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset); +   OUT_BATCH((src_y << 16) | src_x); +   OUT_BATCH(((int) src_pitch & 0xffff)); +   OUT_RELOC(src_buffer, I915_BUFFER_ACCESS_READ, src_offset); +   FLUSH_BATCH(NULL); +} + + diff --git a/src/gallium/drivers/i915simple/i915_blit.h b/src/gallium/drivers/i915simple/i915_blit.h new file mode 100644 index 0000000000..0bb3453861 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_blit.h @@ -0,0 +1,55 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef I915_BLIT_H +#define I915_BLIT_H + +#include "i915_context.h" + +extern void i915_copy_blit(struct i915_context *i915, +                           unsigned do_flip, +			   unsigned cpp, +			   unsigned short src_pitch, +			   struct pipe_buffer *src_buffer, +			   unsigned src_offset, +			   unsigned short dst_pitch, +			   struct pipe_buffer *dst_buffer, +			   unsigned dst_offset, +			   short srcx, short srcy, +			   short dstx, short dsty, +			   short w, short h ); + +extern void i915_fill_blit(struct i915_context *i915, +			   unsigned cpp, +			   unsigned short dst_pitch, +			   struct pipe_buffer *dst_buffer, +			   unsigned dst_offset, +			   short x, short y, +			   short w, short h, unsigned color); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_clear.c b/src/gallium/drivers/i915simple/i915_clear.c new file mode 100644 index 0000000000..8a2d3ca43f --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_clear.c @@ -0,0 +1,48 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "i915_context.h" +#include "i915_state.h" + + +/** + * Clear the given surface to the specified value. + * No masking, no scissor (clear entire buffer). + */ +void +i915_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +   pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +   ps->status = PIPE_SURFACE_STATUS_DEFINED; +} diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c new file mode 100644 index 0000000000..3e3a596884 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_context.c @@ -0,0 +1,193 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_state.h" +#include "i915_batch.h" +#include "i915_texture.h" +#include "i915_reg.h" + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "pipe/p_screen.h" + + +static void i915_destroy( struct pipe_context *pipe ) +{ +   struct i915_context *i915 = i915_context( pipe ); + +   draw_destroy( i915->draw ); +    +   if(i915->winsys->destroy) +      i915->winsys->destroy(i915->winsys); + +   FREE( i915 ); +} + + +static boolean +i915_draw_range_elements(struct pipe_context *pipe, +			     struct pipe_buffer *indexBuffer, +			     unsigned indexSize, +			     unsigned min_index, +			     unsigned max_index, +			     unsigned prim, unsigned start, unsigned count) +{ +   struct i915_context *i915 = i915_context( pipe ); +   struct draw_context *draw = i915->draw; +   unsigned i; + +   if (i915->dirty) +      i915_update_derived( i915 ); + +   /* +    * Map vertex buffers +    */ +   for (i = 0; i < i915->num_vertex_buffers; i++) { +      void *buf +         = pipe_buffer_map(pipe->screen, +                                    i915->vertex_buffer[i].buffer, +                                    PIPE_BUFFER_USAGE_CPU_READ); +      draw_set_mapped_vertex_buffer(draw, i, buf); +   } +   /* Map index buffer, if present */ +   if (indexBuffer) { +      void *mapped_indexes +         = pipe_buffer_map(pipe->screen, indexBuffer, +                                    PIPE_BUFFER_USAGE_CPU_READ); +      draw_set_mapped_element_buffer_range(draw, indexSize, +					   min_index, +					   max_index, +					   mapped_indexes); +   } +   else { +      /* no index/element buffer */ +      draw_set_mapped_element_buffer(draw, 0, NULL); +   } + + +   draw_set_mapped_constant_buffer(draw, +                                   i915->current.constants[PIPE_SHADER_VERTEX], +                                   ( i915->current.num_user_constants[PIPE_SHADER_VERTEX] *  +                                     4 * sizeof(float) )); + +   /* draw! 
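+    * At this point the vertex buffers and any index buffer have been
+    * mapped and, together with the current vertex-shader constants,
+    * handed to the draw module; draw_arrays() below runs draw's
+    * vertex pipeline over them before the buffers are unmapped again.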
*/ +   draw_arrays(i915->draw, prim, start, count); + +   /* +    * unmap vertex/index buffers +    */ +   for (i = 0; i < i915->num_vertex_buffers; i++) { +      pipe_buffer_unmap(pipe->screen, i915->vertex_buffer[i].buffer); +      draw_set_mapped_vertex_buffer(draw, i, NULL); +   } +   if (indexBuffer) { +      pipe_buffer_unmap(pipe->screen, indexBuffer); +      draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL); +   } + +   return TRUE; +} + +static boolean +i915_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count) +{ +   return i915_draw_range_elements( pipe, indexBuffer, +					indexSize, +					0, 0xffffffff, +					prim, start, count ); +} + +static boolean i915_draw_arrays( struct pipe_context *pipe, +				 unsigned prim, unsigned start, unsigned count) +{ +   return i915_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + +struct pipe_context *i915_create_context( struct pipe_screen *screen, +                                          struct pipe_winsys *pipe_winsys, +                                          struct i915_winsys *i915_winsys ) +{ +   struct i915_context *i915; + +   i915 = CALLOC_STRUCT(i915_context); +   if (i915 == NULL) +      return NULL; + +   i915->winsys = i915_winsys; +   i915->pipe.winsys = pipe_winsys; +   i915->pipe.screen = screen; + +   i915->pipe.destroy = i915_destroy; + +   i915->pipe.clear = i915_clear; + + +   i915->pipe.draw_arrays = i915_draw_arrays; +   i915->pipe.draw_elements = i915_draw_elements; +   i915->pipe.draw_range_elements = i915_draw_range_elements; + +   /* +    * Create drawing context and plug our rendering stage into it. +    */ +   i915->draw = draw_create(); +   assert(i915->draw); +   if (!debug_get_bool_option("I915_NO_VBUF", FALSE)) { +      draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915)); +   } +   else { +      draw_set_rasterize_stage(i915->draw, i915_draw_render_stage(i915)); +   } + +   i915_init_surface_functions(i915); +   i915_init_state_functions(i915); +   i915_init_flush_functions(i915); +   i915_init_texture_functions(i915); + +   draw_install_aaline_stage(i915->draw, &i915->pipe); +   draw_install_aapoint_stage(i915->draw, &i915->pipe); + +   i915->dirty = ~0; +   i915->hardware_dirty = ~0; + +   /* Batch stream debugging is a bit hacked up at the moment: +    */ +   i915->batch = i915_winsys->batch_get(i915_winsys); +   i915->batch->winsys = i915_winsys; + +   return &i915->pipe; +} + diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h new file mode 100644 index 0000000000..3cdabe45f9 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_context.h @@ -0,0 +1,345 @@ + /************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef I915_CONTEXT_H +#define I915_CONTEXT_H + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "draw/draw_vertex.h" + +#include "tgsi/tgsi_scan.h" + + +#define I915_TEX_UNITS 8 + +#define I915_DYNAMIC_MODES4       0 +#define I915_DYNAMIC_DEPTHSCALE_0 1 /* just the header */ +#define I915_DYNAMIC_DEPTHSCALE_1 2  +#define I915_DYNAMIC_IAB          3 +#define I915_DYNAMIC_BC_0         4 /* just the header */ +#define I915_DYNAMIC_BC_1         5 +#define I915_DYNAMIC_BFO_0        6  +#define I915_DYNAMIC_BFO_1        7 +#define I915_DYNAMIC_STP_0        8  +#define I915_DYNAMIC_STP_1        9  +#define I915_DYNAMIC_SC_ENA_0     10  +#define I915_DYNAMIC_SC_RECT_0    11  +#define I915_DYNAMIC_SC_RECT_1    12  +#define I915_DYNAMIC_SC_RECT_2    13  +#define I915_MAX_DYNAMIC          14 + + +#define I915_IMMEDIATE_S0         0 +#define I915_IMMEDIATE_S1         1 +#define I915_IMMEDIATE_S2         2 +#define I915_IMMEDIATE_S3         3 +#define I915_IMMEDIATE_S4         4 +#define I915_IMMEDIATE_S5         5 +#define I915_IMMEDIATE_S6         6 +#define I915_IMMEDIATE_S7         7 +#define I915_MAX_IMMEDIATE        8 + +/* These must mach the order of LI0_STATE_* bits, as they will be used + * to generate hardware packets: + */ +#define I915_CACHE_STATIC         0  +#define I915_CACHE_DYNAMIC        1 /* handled specially */ +#define I915_CACHE_SAMPLER        2 +#define I915_CACHE_MAP            3 +#define I915_CACHE_PROGRAM        4 +#define I915_CACHE_CONSTANTS      5 +#define I915_MAX_CACHE            6 + +#define I915_MAX_CONSTANT  32 + + +/** See constant_flags[] below */ +#define I915_CONSTFLAG_USER 0x1f + + +/** + * Subclass of pipe_shader_state + */ +struct i915_fragment_shader +{ +   struct pipe_shader_state state; + +   struct tgsi_shader_info info; + +   uint *program; +   uint program_len; + +   /** +    * constants introduced during translation. +    * These are placed at the end of the constant buffer and grow toward +    * the beginning (eg: slot 31, 30 29, ...) +    * User-provided constants start at 0. +    * This allows both types of constants to co-exist (until there's too many) +    * and doesn't require regenerating/changing the fragment program to +    * shuffle constants around. 
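+    *
+    * For example: with three user constants and two translation-time
+    * immediates, slots 0, 1 and 2 hold the user values while slots 31
+    * and 30 hold the immediates; the two regions only collide once
+    * their combined count exceeds I915_MAX_CONSTANT (32).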
+    */ +   uint num_constants; +   float constants[I915_MAX_CONSTANT][4]; + +   /** +    * Status of each constant +    * if I915_CONSTFLAG_PARAM, the value must be taken from the corresponding +    * slot of the user's constant buffer. (set by pipe->set_constant_buffer()) +    * Else, the bitmask indicates which components are occupied by immediates. +    */ +   ubyte constant_flags[I915_MAX_CONSTANT]; +}; + + +struct i915_cache_context; + +/* Use to calculate differences between state emitted to hardware and + * current driver-calculated state.   + */ +struct i915_state  +{ +   unsigned immediate[I915_MAX_IMMEDIATE]; +   unsigned dynamic[I915_MAX_DYNAMIC]; + +   float constants[PIPE_SHADER_TYPES][I915_MAX_CONSTANT][4]; +   /** number of constants passed in through a constant buffer */ +   uint num_user_constants[PIPE_SHADER_TYPES]; + +   /* texture sampler state */ +   unsigned sampler[I915_TEX_UNITS][3]; +   unsigned sampler_enable_flags; +   unsigned sampler_enable_nr; + +   /* texture image buffers */ +   unsigned texbuffer[I915_TEX_UNITS][2]; + +   /** Describes the current hardware vertex layout */ +   struct vertex_info vertex_info; +    +   unsigned id;			/* track lost context events */ +}; + +struct i915_blend_state { +   unsigned iab; +   unsigned modes4; +   unsigned LIS5; +   unsigned LIS6; +}; + +struct i915_depth_stencil_state { +   unsigned stencil_modes4; +   unsigned bfo[2]; +   unsigned stencil_LIS5; +   unsigned depth_LIS6; +}; + +struct i915_rasterizer_state { +   int light_twoside : 1; +   unsigned st; +   enum interp_mode color_interp; + +   unsigned LIS4; +   unsigned LIS7; +   unsigned sc[1]; + +   const struct pipe_rasterizer_state *templ; + +   union { float f; unsigned u; } ds[2]; +}; + +struct i915_sampler_state { +   unsigned state[3]; +   const struct pipe_sampler_state *templ; +   unsigned minlod; +   unsigned maxlod; +}; + + +struct i915_texture { +   struct pipe_texture base; + +   /* Derived from the above: +    */ +   unsigned stride; +   unsigned depth_stride;          /* per-image on i945? */ +   unsigned total_nblocksy; + +   unsigned tiled; + +   unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS]; + +   /* Explicitly store the offset of each image for each cube face or +    * depth value.  
Pretty much have to accept that hardware formats +    * are going to be so diverse that there is no unified way to +    * compute the offsets of depth/cube images within a mipmap level, +    * so have to store them as a lookup table: +    */ +   unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS];   /**< array [depth] of offsets */ + +   /* The data is held here: +    */ +   struct pipe_buffer *buffer; +}; + +struct i915_batchbuffer; + +struct i915_context +{ +   struct pipe_context pipe; +   struct i915_winsys *winsys; +   struct draw_context *draw; + +   /* The most recent drawing state as set by the driver: +    */ +   const struct i915_blend_state           *blend; +   const struct i915_sampler_state         *sampler[PIPE_MAX_SAMPLERS]; +   const struct i915_depth_stencil_state   *depth_stencil; +   const struct i915_rasterizer_state      *rasterizer; + +   struct i915_fragment_shader *fs; + +   struct pipe_blend_color blend_color; +   struct pipe_clip_state clip; +   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES]; +   struct pipe_framebuffer_state framebuffer; +   struct pipe_poly_stipple poly_stipple; +   struct pipe_scissor_state scissor; +   struct i915_texture *texture[PIPE_MAX_SAMPLERS]; +   struct pipe_viewport_state viewport; +   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + +   unsigned dirty; + +   unsigned num_samplers; +   unsigned num_textures; +   unsigned num_vertex_elements; +   unsigned num_vertex_buffers; + +   struct i915_batchbuffer *batch; + +   /** Vertex buffer */ +   struct pipe_buffer *vbo; +   size_t vbo_offset; +   unsigned vbo_flushed; + +   struct i915_state current; +   unsigned hardware_dirty; +    +   unsigned debug; +}; + +/* A flag for each state_tracker state object: + */ +#define I915_NEW_VIEWPORT      0x1 +#define I915_NEW_RASTERIZER    0x2 +#define I915_NEW_FS            0x4 +#define I915_NEW_BLEND         0x8 +#define I915_NEW_CLIP          0x10 +#define I915_NEW_SCISSOR       0x20 +#define I915_NEW_STIPPLE       0x40 +#define I915_NEW_FRAMEBUFFER   0x80 +#define I915_NEW_ALPHA_TEST    0x100 +#define I915_NEW_DEPTH_STENCIL 0x200 +#define I915_NEW_SAMPLER       0x400 +#define I915_NEW_TEXTURE       0x800 +#define I915_NEW_CONSTANTS     0x1000 +#define I915_NEW_VBO           0x2000 +#define I915_NEW_VS            0x4000 + + +/* Driver's internally generated state flags: + */ +#define I915_NEW_VERTEX_FORMAT    0x10000 + + +/* Dirty flags for hardware emit + */ +#define I915_HW_STATIC            (1<<I915_CACHE_STATIC) +#define I915_HW_DYNAMIC           (1<<I915_CACHE_DYNAMIC) +#define I915_HW_SAMPLER           (1<<I915_CACHE_SAMPLER) +#define I915_HW_MAP               (1<<I915_CACHE_MAP) +#define I915_HW_PROGRAM           (1<<I915_CACHE_PROGRAM) +#define I915_HW_CONSTANTS         (1<<I915_CACHE_CONSTANTS) +#define I915_HW_IMMEDIATE         (1<<(I915_MAX_CACHE+0)) +#define I915_HW_INVARIENT         (1<<(I915_MAX_CACHE+1)) + + +/*********************************************************************** + * i915_prim_emit.c:  + */ +struct draw_stage *i915_draw_render_stage( struct i915_context *i915 ); + + +/*********************************************************************** + * i915_prim_vbuf.c:  + */ +struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 ); + + +/*********************************************************************** + * i915_state_emit.c:  + */ +void i915_emit_hardware_state(struct i915_context *i915 ); + + + +/*********************************************************************** + * i915_clear.c:  
+ */ +void i915_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		unsigned clearValue); + + +/*********************************************************************** + * i915_surface.c:  + */ +void i915_init_surface_functions( struct i915_context *i915 ); + +void i915_init_state_functions( struct i915_context *i915 ); +void i915_init_flush_functions( struct i915_context *i915 ); +void i915_init_string_functions( struct i915_context *i915 ); + + + + +/*********************************************************************** + * Inline conversion functions.  These are better-typed than the + * macros used previously: + */ +static INLINE struct i915_context * +i915_context( struct pipe_context *pipe ) +{ +   return (struct i915_context *)pipe; +} + + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915simple/i915_debug.c new file mode 100644 index 0000000000..a300b61c3b --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug.c @@ -0,0 +1,899 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_debug.h" +#include "i915_batch.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_debug.h" + + +static void +PRINTF( +   struct debug_stream  *stream, +   const char           *fmt, +                        ... 
) +{ +   va_list  args; + +   va_start( args, fmt ); +   debug_vprintf( fmt, args ); +   va_end( args ); +} + + +static boolean debug( struct debug_stream *stream, const char *name, unsigned len ) +{ +   unsigned i; +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +    +   if (len == 0) { +      PRINTF(stream, "Error - zero length packet (0x%08x)\n", stream->ptr[0]); +      assert(0); +      return FALSE; +   } + +   if (stream->print_addresses) +      PRINTF(stream, "%08x:  ", stream->offset); + + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   for (i = 0; i < len; i++) +      PRINTF(stream, "\t0x%08x\n",  ptr[i]);    +   PRINTF(stream, "\n"); + +   stream->offset += len * sizeof(unsigned); +    +   return TRUE; +} + + +static const char *get_prim_name( unsigned val ) +{ +   switch (val & PRIM3D_MASK) { +   case PRIM3D_TRILIST: return "TRILIST"; break; +   case PRIM3D_TRISTRIP: return "TRISTRIP"; break; +   case PRIM3D_TRISTRIP_RVRSE: return "TRISTRIP_RVRSE"; break; +   case PRIM3D_TRIFAN: return "TRIFAN"; break; +   case PRIM3D_POLY: return "POLY"; break; +   case PRIM3D_LINELIST: return "LINELIST"; break; +   case PRIM3D_LINESTRIP: return "LINESTRIP"; break; +   case PRIM3D_RECTLIST: return "RECTLIST"; break; +   case PRIM3D_POINTLIST: return "POINTLIST"; break; +   case PRIM3D_DIB: return "DIB"; break; +   case PRIM3D_CLEAR_RECT: return "CLEAR_RECT"; break; +   case PRIM3D_ZONE_INIT: return "ZONE_INIT"; break; +   default: return "????"; break; +   } +} + +static boolean debug_prim( struct debug_stream *stream, const char *name,  +			     boolean dump_floats, +			     unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   const char *prim = get_prim_name( ptr[0] ); +   unsigned i; +    + + +   PRINTF(stream, "%s %s (%d dwords):\n", name, prim, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[0]);    +   for (i = 1; i < len; i++) { +      if (dump_floats) +	 PRINTF(stream, "\t0x%08x // %f\n",  ptr[i], *(float *)&ptr[i]);    +      else +	 PRINTF(stream, "\t0x%08x\n",  ptr[i]);    +   } + +       +   PRINTF(stream, "\n"); + +   stream->offset += len * sizeof(unsigned); +    +   return TRUE; +} +    + + + +static boolean debug_program( struct debug_stream *stream, const char *name, unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + +   if (len == 0) { +      PRINTF(stream, "Error - zero length packet (0x%08x)\n", stream->ptr[0]); +      assert(0); +      return FALSE; +   } + +   if (stream->print_addresses) +      PRINTF(stream, "%08x:  ", stream->offset); + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   i915_disassemble_program( stream, ptr, len ); + +   stream->offset += len * sizeof(unsigned); +   return TRUE; +} + + +static boolean debug_chain( struct debug_stream *stream, const char *name, unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned old_offset = stream->offset + len * sizeof(unsigned); +   unsigned i; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   for (i = 0; i < len; i++) +      PRINTF(stream, "\t0x%08x\n",  ptr[i]); + +   stream->offset = ptr[1] & ~0x3; +    +   if (stream->offset < old_offset) +      PRINTF(stream, "\n... skipping backwards from 0x%x --> 0x%x ...\n\n",  +		   old_offset, stream->offset ); +   else +      PRINTF(stream, "\n... 
skipping from 0x%x --> 0x%x ...\n\n",  +		   old_offset, stream->offset ); + + +   return TRUE; +} + + +static boolean debug_variable_length_prim( struct debug_stream *stream ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   const char *prim = get_prim_name( ptr[0] ); +   unsigned i, len; + +   ushort *idx = (ushort *)(ptr+1); +   for (i = 0; idx[i] != 0xffff; i++) +      ; + +   len = 1+(i+2)/2; + +   PRINTF(stream, "3DPRIM, %s variable length %d indicies (%d dwords):\n", prim, i, len); +   for (i = 0; i < len; i++) +      PRINTF(stream, "\t0x%08x\n",  ptr[i]); +   PRINTF(stream, "\n"); + +   stream->offset += len * sizeof(unsigned); +   return TRUE; +} + + +static void +BITS( +   struct debug_stream  *stream, +   unsigned             dw, +   unsigned             hi, +   unsigned             lo, +   const char           *fmt, +                        ... ) +{ +   va_list  args; +   unsigned himask = ~0UL >> (31 - (hi)); + +   PRINTF(stream, "\t\t "); + +   va_start( args, fmt ); +   debug_vprintf( fmt, args ); +   va_end( args ); + +   PRINTF(stream, ": 0x%x\n", ((dw) & himask) >> (lo)); +} + +#ifdef DEBUG +#define MBZ( dw, hi, lo) do {							\ +   unsigned x = (dw) >> (lo);				\ +   unsigned lomask = (1 << (lo)) - 1;			\ +   unsigned himask;					\ +   himask = (1UL << (hi)) - 1;				\ +   assert ((x & himask & ~lomask) == 0);	\ +} while (0) +#else +#define MBZ( dw, hi, lo) do {							\ +} while (0) +#endif + +static void +FLAG( +   struct debug_stream  *stream, +   unsigned             dw, +   unsigned             bit, +   const char           *fmt, +                        ... ) +{ +   if (((dw) >> (bit)) & 1) { +      va_list  args; + +      PRINTF(stream, "\t\t "); + +      va_start( args, fmt ); +      debug_vprintf( fmt, args ); +      va_end( args ); + +      PRINTF(stream, "\n"); +   } +} + +static boolean debug_load_immediate( struct debug_stream *stream, +				       const char *name, +				       unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned bits = (ptr[0] >> 4) & 0xff; +   unsigned j = 0; +    +   PRINTF(stream, "%s (%d dwords, flags: %x):\n", name, len, bits); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); + +   if (bits & (1<<0)) { +      PRINTF(stream, "\t  LIS0: 0x%08x\n", ptr[j]); +      PRINTF(stream, "\t vb address: 0x%08x\n", (ptr[j] & ~0x3)); +      BITS(stream, ptr[j], 0, 0, "vb invalidate disable"); +      j++; +   } +   if (bits & (1<<1)) { +      PRINTF(stream, "\t  LIS1: 0x%08x\n", ptr[j]); +      BITS(stream, ptr[j], 29, 24, "vb dword width"); +      BITS(stream, ptr[j], 21, 16, "vb dword pitch"); +      BITS(stream, ptr[j], 15, 0, "vb max index"); +      j++; +   } +   if (bits & (1<<2)) { +      int i; +      PRINTF(stream, "\t  LIS2: 0x%08x\n", ptr[j]); +      for (i = 0; i < 8; i++) { +	 unsigned tc = (ptr[j] >> (i * 4)) & 0xf; +	 if (tc != 0xf) +	    BITS(stream, tc, 3, 0, "tex coord %d", i); +      } +      j++; +   } +   if (bits & (1<<3)) { +      PRINTF(stream, "\t  LIS3: 0x%08x\n", ptr[j]); +      j++; +   } +   if (bits & (1<<4)) { +      PRINTF(stream, "\t  LIS4: 0x%08x\n", ptr[j]); +      BITS(stream, ptr[j], 31, 23, "point width"); +      BITS(stream, ptr[j], 22, 19, "line width"); +      FLAG(stream, ptr[j], 18, "alpha flatshade"); +      FLAG(stream, ptr[j], 17, "fog flatshade"); +      FLAG(stream, ptr[j], 16, "spec flatshade"); +      FLAG(stream, ptr[j], 15, "rgb flatshade"); +      BITS(stream, ptr[j], 14, 13, "cull mode"); +      FLAG(stream, ptr[j], 12, "vfmt: point width"); 
+      FLAG(stream, ptr[j], 11, "vfmt: specular/fog"); +      FLAG(stream, ptr[j], 10, "vfmt: rgba"); +      FLAG(stream, ptr[j], 9, "vfmt: depth offset"); +      BITS(stream, ptr[j], 8, 6, "vfmt: position (2==xyzw)"); +      FLAG(stream, ptr[j], 5, "force dflt diffuse"); +      FLAG(stream, ptr[j], 4, "force dflt specular"); +      FLAG(stream, ptr[j], 3, "local depth offset enable"); +      FLAG(stream, ptr[j], 2, "vfmt: fp32 fog coord"); +      FLAG(stream, ptr[j], 1, "sprite point"); +      FLAG(stream, ptr[j], 0, "antialiasing"); +      j++; +   } +   if (bits & (1<<5)) { +      PRINTF(stream, "\t  LIS5: 0x%08x\n", ptr[j]); +      BITS(stream, ptr[j], 31, 28, "rgba write disables"); +      FLAG(stream, ptr[j], 27,     "force dflt point width"); +      FLAG(stream, ptr[j], 26,     "last pixel enable"); +      FLAG(stream, ptr[j], 25,     "global z offset enable"); +      FLAG(stream, ptr[j], 24,     "fog enable"); +      BITS(stream, ptr[j], 23, 16, "stencil ref"); +      BITS(stream, ptr[j], 15, 13, "stencil test"); +      BITS(stream, ptr[j], 12, 10, "stencil fail op"); +      BITS(stream, ptr[j], 9, 7,   "stencil pass z fail op"); +      BITS(stream, ptr[j], 6, 4,   "stencil pass z pass op"); +      FLAG(stream, ptr[j], 3,      "stencil write enable"); +      FLAG(stream, ptr[j], 2,      "stencil test enable"); +      FLAG(stream, ptr[j], 1,      "color dither enable"); +      FLAG(stream, ptr[j], 0,      "logiop enable"); +      j++; +   } +   if (bits & (1<<6)) { +      PRINTF(stream, "\t  LIS6: 0x%08x\n", ptr[j]); +      FLAG(stream, ptr[j], 31,      "alpha test enable"); +      BITS(stream, ptr[j], 30, 28,  "alpha func"); +      BITS(stream, ptr[j], 27, 20,  "alpha ref"); +      FLAG(stream, ptr[j], 19,      "depth test enable"); +      BITS(stream, ptr[j], 18, 16,  "depth func"); +      FLAG(stream, ptr[j], 15,      "blend enable"); +      BITS(stream, ptr[j], 14, 12,  "blend func"); +      BITS(stream, ptr[j], 11, 8,   "blend src factor"); +      BITS(stream, ptr[j], 7,  4,   "blend dst factor"); +      FLAG(stream, ptr[j], 3,       "depth write enable"); +      FLAG(stream, ptr[j], 2,       "color write enable"); +      BITS(stream, ptr[j], 1,  0,   "provoking vertex");  +      j++; +   } + + +   PRINTF(stream, "\n"); + +   assert(j == len); + +   stream->offset += len * sizeof(unsigned); +    +   return TRUE; +} +  + + +static boolean debug_load_indirect( struct debug_stream *stream, +				      const char *name, +				      unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned bits = (ptr[0] >> 8) & 0x3f; +   unsigned i, j = 0; +    +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); + +   for (i = 0; i < 6; i++) { +      if (bits & (1<<i)) { +	 switch (1<<(8+i)) { +	 case LI0_STATE_STATIC_INDIRECT: +	    PRINTF(stream, "        STATIC: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    PRINTF(stream, "                0x%08x\n", ptr[j++]); +	    break; +	 case LI0_STATE_DYNAMIC_INDIRECT: +	    PRINTF(stream, "       DYNAMIC: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    break; +	 case LI0_STATE_SAMPLER: +	    PRINTF(stream, "       SAMPLER: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    PRINTF(stream, "                0x%08x\n", ptr[j++]); +	    break; +	 case LI0_STATE_MAP: +	    PRINTF(stream, "           MAP: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    PRINTF(stream, "                0x%08x\n", ptr[j++]); +	    break; +	 case LI0_STATE_PROGRAM: +	    PRINTF(stream, "       
PROGRAM: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    PRINTF(stream, "                0x%08x\n", ptr[j++]); +	    break; +	 case LI0_STATE_CONSTANTS: +	    PRINTF(stream, "     CONSTANTS: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; +	    PRINTF(stream, "                0x%08x\n", ptr[j++]); +	    break; +	 default: +	    assert(0); +	    break; +	 } +      } +   } + +   if (bits == 0) { +      PRINTF(stream, "\t  DUMMY: 0x%08x\n", ptr[j++]); +   } + +   PRINTF(stream, "\n"); + + +   assert(j == len); + +   stream->offset += len * sizeof(unsigned); +    +   return TRUE; +} + 	 +static void BR13( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x\n",  val); +   FLAG(stream, val, 30, "clipping enable"); +   BITS(stream, val, 25, 24, "color depth (3==32bpp)"); +   BITS(stream, val, 23, 16, "raster op"); +   BITS(stream, val, 15, 0,  "dest pitch"); +} + + +static void BR22( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x\n",  val); +   BITS(stream, val, 31, 16, "dest y1"); +   BITS(stream, val, 15, 0,  "dest x1"); +} + +static void BR23( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x\n",  val); +   BITS(stream, val, 31, 16, "dest y2"); +   BITS(stream, val, 15, 0,  "dest x2"); +} + +static void BR09( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x -- dest address\n",  val); +} + +static void BR26( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x\n",  val); +   BITS(stream, val, 31, 16, "src y1"); +   BITS(stream, val, 15, 0,  "src x1"); +} + +static void BR11( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x\n",  val); +   BITS(stream, val, 15, 0,  "src pitch"); +} + +static void BR12( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x -- src address\n",  val); +} + +static void BR16( struct debug_stream *stream, +		  unsigned val ) +{ +   PRINTF(stream, "\t0x%08x -- color\n",  val); +} +    +static boolean debug_copy_blit( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   int j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); +    +   BR13(stream, ptr[j++]); +   BR22(stream, ptr[j++]); +   BR23(stream, ptr[j++]); +   BR09(stream, ptr[j++]); +   BR26(stream, ptr[j++]); +   BR11(stream, ptr[j++]); +   BR12(stream, ptr[j++]); + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_color_blit( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   int j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); + +   BR13(stream, ptr[j++]); +   BR22(stream, ptr[j++]); +   BR23(stream, ptr[j++]); +   BR09(stream, ptr[j++]); +   BR16(stream, ptr[j++]); + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_modes4( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   int j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j]); +   BITS(stream, ptr[j], 21, 18, "logicop func"); +   FLAG(stream, ptr[j], 17, "stencil test mask modify-enable"); + 
  FLAG(stream, ptr[j], 16, "stencil write mask modify-enable"); +   BITS(stream, ptr[j], 15, 8, "stencil test mask"); +   BITS(stream, ptr[j], 7, 0,  "stencil write mask"); +   j++; + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_map_state( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); +    +   { +      PRINTF(stream, "\t0x%08x\n",  ptr[j]); +      BITS(stream, ptr[j], 15, 0,   "map mask"); +      j++; +   } + +   while (j < len) { +      { +	 PRINTF(stream, "\t  TMn.0: 0x%08x\n", ptr[j]); +	 PRINTF(stream, "\t map address: 0x%08x\n", (ptr[j] & ~0x3)); +	 FLAG(stream, ptr[j], 1, "vertical line stride"); +	 FLAG(stream, ptr[j], 0, "vertical line stride offset"); +	 j++; +      } + +      { +	 PRINTF(stream, "\t  TMn.1: 0x%08x\n", ptr[j]); +	 BITS(stream, ptr[j], 31, 21, "height"); +	 BITS(stream, ptr[j], 20, 10, "width"); +	 BITS(stream, ptr[j], 9, 7, "surface format"); +	 BITS(stream, ptr[j], 6, 3, "texel format"); +	 FLAG(stream, ptr[j], 2, "use fence regs"); +	 FLAG(stream, ptr[j], 1, "tiled surface"); +	 FLAG(stream, ptr[j], 0, "tile walk ymajor"); +	 j++; +      } +      { +	 PRINTF(stream, "\t  TMn.2: 0x%08x\n", ptr[j]); +	 BITS(stream, ptr[j], 31, 21, "dword pitch"); +	 BITS(stream, ptr[j], 20, 15, "cube face enables"); +	 BITS(stream, ptr[j], 14, 9, "max lod"); +	 FLAG(stream, ptr[j], 8,     "mip layout right"); +	 BITS(stream, ptr[j], 7, 0, "depth"); +	 j++; +      } +   } + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_sampler_state( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); +    +   { +      PRINTF(stream, "\t0x%08x\n",  ptr[j]); +      BITS(stream, ptr[j], 15, 0,   "sampler mask"); +      j++; +   } + +   while (j < len) { +      { +	 PRINTF(stream, "\t  TSn.0: 0x%08x\n", ptr[j]); +	 FLAG(stream, ptr[j], 31, "reverse gamma"); +	 FLAG(stream, ptr[j], 30, "planar to packed"); +	 FLAG(stream, ptr[j], 29, "yuv->rgb"); +	 BITS(stream, ptr[j], 28, 27, "chromakey index"); +	 BITS(stream, ptr[j], 26, 22, "base mip level"); +	 BITS(stream, ptr[j], 21, 20, "mip mode filter"); +	 BITS(stream, ptr[j], 19, 17, "mag mode filter"); +	 BITS(stream, ptr[j], 16, 14, "min mode filter"); +	 BITS(stream, ptr[j], 13, 5,  "lod bias (s4.4)"); +	 FLAG(stream, ptr[j], 4,      "shadow enable"); +	 FLAG(stream, ptr[j], 3,      "max-aniso-4"); +	 BITS(stream, ptr[j], 2, 0,   "shadow func"); +	 j++; +      } + +      { +	 PRINTF(stream, "\t  TSn.1: 0x%08x\n", ptr[j]); +	 BITS(stream, ptr[j], 31, 24, "min lod"); +	 MBZ( ptr[j], 23, 18 ); +	 FLAG(stream, ptr[j], 17,     "kill pixel enable"); +	 FLAG(stream, ptr[j], 16,     "keyed tex filter mode"); +	 FLAG(stream, ptr[j], 15,     "chromakey enable"); +	 BITS(stream, ptr[j], 14, 12, "tcx wrap mode"); +	 BITS(stream, ptr[j], 11, 9,  "tcy wrap mode"); +	 BITS(stream, ptr[j], 8,  6,  "tcz wrap mode"); +	 FLAG(stream, ptr[j], 5,      "normalized coords"); +	 BITS(stream, ptr[j], 4,  1,  "map (surface) index"); +	 FLAG(stream, ptr[j], 0,      "EAST deinterlacer enable"); +	 j++; +      } +      { +	 
PRINTF(stream, "\t  TSn.2: 0x%08x  (default color)\n", ptr[j]); +	 j++; +      } +   } + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_dest_vars( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   int j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); + +   { +      PRINTF(stream, "\t0x%08x\n",  ptr[j]); +      FLAG(stream, ptr[j], 31,     "early classic ztest"); +      FLAG(stream, ptr[j], 30,     "opengl tex default color"); +      FLAG(stream, ptr[j], 29,     "bypass iz"); +      FLAG(stream, ptr[j], 28,     "lod preclamp"); +      BITS(stream, ptr[j], 27, 26, "dither pattern"); +      FLAG(stream, ptr[j], 25,     "linear gamma blend"); +      FLAG(stream, ptr[j], 24,     "debug dither"); +      BITS(stream, ptr[j], 23, 20, "dstorg x"); +      BITS(stream, ptr[j], 19, 16, "dstorg y"); +      MBZ (ptr[j], 15, 15 ); +      BITS(stream, ptr[j], 14, 12, "422 write select"); +      BITS(stream, ptr[j], 11, 8,  "cbuf format"); +      BITS(stream, ptr[j], 3, 2,   "zbuf format"); +      FLAG(stream, ptr[j], 1,      "vert line stride"); +      FLAG(stream, ptr[j], 1,      "vert line stride offset"); +      j++; +   } +    +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean debug_buf_info( struct debug_stream *stream, +				  const char *name, +				  unsigned len ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   int j = 0; + +   PRINTF(stream, "%s (%d dwords):\n", name, len); +   PRINTF(stream, "\t0x%08x\n",  ptr[j++]); + +   { +      PRINTF(stream, "\t0x%08x\n",  ptr[j]); +      BITS(stream, ptr[j], 28, 28, "aux buffer id"); +      BITS(stream, ptr[j], 27, 24, "buffer id (7=depth, 3=back)"); +      FLAG(stream, ptr[j], 23,     "use fence regs"); +      FLAG(stream, ptr[j], 22,     "tiled surface"); +      FLAG(stream, ptr[j], 21,     "tile walk ymajor"); +      MBZ (ptr[j], 20, 14); +      BITS(stream, ptr[j], 13, 2,  "dword pitch"); +      MBZ (ptr[j], 2,  0); +      j++; +   } +    +   PRINTF(stream, "\t0x%08x -- buffer base address\n",  ptr[j++]); + +   stream->offset += len * sizeof(unsigned); +   assert(j == len); +   return TRUE; +} + +static boolean i915_debug_packet( struct debug_stream *stream ) +{ +   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); +   unsigned cmd = *ptr; +    +   switch (((cmd >> 29) & 0x7)) { +   case 0x0: +      switch ((cmd >> 23) & 0x3f) { +      case 0x0: +	 return debug(stream, "MI_NOOP", 1); +      case 0x3: +	 return debug(stream, "MI_WAIT_FOR_EVENT", 1); +      case 0x4: +	 return debug(stream, "MI_FLUSH", 1); +      case 0xA: +	 debug(stream, "MI_BATCH_BUFFER_END", 1); +	 return FALSE; +      case 0x22: +	 return debug(stream, "MI_LOAD_REGISTER_IMM", 3); +      case 0x31: +	 return debug_chain(stream, "MI_BATCH_BUFFER_START", 2); +      default: +         (void)debug(stream, "UNKNOWN 0x0 case!", 1); +         assert(0); +	 break; +      } +      break; +   case 0x1: +      (void) debug(stream, "UNKNOWN 0x1 case!", 1); +      assert(0); +      break; +   case 0x2: +      switch ((cmd >> 22) & 0xff) {	  +      case 0x50: +	 return debug_color_blit(stream, "XY_COLOR_BLT", (cmd & 0xff) + 2); +      case 0x53: +	 return debug_copy_blit(stream, "XY_SRC_COPY_BLT", (cmd & 0xff) + 2); +      default: +	 return debug(stream, "blit command", (cmd & 0xff) + 2); +      } 
+      break; +   case 0x3: +      switch ((cmd >> 24) & 0x1f) {	  +      case 0x6: +	 return debug(stream, "3DSTATE_ANTI_ALIASING", 1); +      case 0x7: +	 return debug(stream, "3DSTATE_RASTERIZATION_RULES", 1); +      case 0x8: +	 return debug(stream, "3DSTATE_BACKFACE_STENCIL_OPS", 2); +      case 0x9: +	 return debug(stream, "3DSTATE_BACKFACE_STENCIL_MASKS", 1); +      case 0xb: +	 return debug(stream, "3DSTATE_INDEPENDENT_ALPHA_BLEND", 1); +      case 0xc: +	 return debug(stream, "3DSTATE_MODES5", 1);	  +      case 0xd: +	 return debug_modes4(stream, "3DSTATE_MODES4", 1); +      case 0x15: +	 return debug(stream, "3DSTATE_FOG_COLOR", 1); +      case 0x16: +	 return debug(stream, "3DSTATE_COORD_SET_BINDINGS", 1); +      case 0x1c: +	 /* 3DState16NP */ +	 switch((cmd >> 19) & 0x1f) { +	 case 0x10: +	    return debug(stream, "3DSTATE_SCISSOR_ENABLE", 1); +	 case 0x11: +	    return debug(stream, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE", 1); +	 default: +            (void) debug(stream, "UNKNOWN 0x1c case!", 1); +            assert(0); +	    break; +	 } +	 break; +      case 0x1d: +	 /* 3DStateMW */ +	 switch ((cmd >> 16) & 0xff) { +	 case 0x0: +	    return debug_map_state(stream, "3DSTATE_MAP_STATE", (cmd & 0x1f) + 2); +	 case 0x1: +	    return debug_sampler_state(stream, "3DSTATE_SAMPLER_STATE", (cmd & 0x1f) + 2); +	 case 0x4: +	    return debug_load_immediate(stream, "3DSTATE_LOAD_STATE_IMMEDIATE", (cmd & 0xf) + 2); +	 case 0x5: +	    return debug_program(stream, "3DSTATE_PIXEL_SHADER_PROGRAM", (cmd & 0x1ff) + 2); +	 case 0x6: +	    return debug(stream, "3DSTATE_PIXEL_SHADER_CONSTANTS", (cmd & 0xff) + 2); +	 case 0x7: +	    return debug_load_indirect(stream, "3DSTATE_LOAD_INDIRECT", (cmd & 0xff) + 2); +	 case 0x80: +	    return debug(stream, "3DSTATE_DRAWING_RECTANGLE", (cmd & 0xffff) + 2); +	 case 0x81: +	    return debug(stream, "3DSTATE_SCISSOR_RECTANGLE", (cmd & 0xffff) + 2); +	 case 0x83: +	    return debug(stream, "3DSTATE_SPAN_STIPPLE", (cmd & 0xffff) + 2); +	 case 0x85: +	    return debug_dest_vars(stream, "3DSTATE_DEST_BUFFER_VARS", (cmd & 0xffff) + 2); +	 case 0x88: +	    return debug(stream, "3DSTATE_CONSTANT_BLEND_COLOR", (cmd & 0xffff) + 2); +	 case 0x89: +	    return debug(stream, "3DSTATE_FOG_MODE", (cmd & 0xffff) + 2); +	 case 0x8e: +	    return debug_buf_info(stream, "3DSTATE_BUFFER_INFO", (cmd & 0xffff) + 2); +	 case 0x97: +	    return debug(stream, "3DSTATE_DEPTH_OFFSET_SCALE", (cmd & 0xffff) + 2); +	 case 0x98: +	    return debug(stream, "3DSTATE_DEFAULT_Z", (cmd & 0xffff) + 2); +	 case 0x99: +	    return debug(stream, "3DSTATE_DEFAULT_DIFFUSE", (cmd & 0xffff) + 2); +	 case 0x9a: +	    return debug(stream, "3DSTATE_DEFAULT_SPECULAR", (cmd & 0xffff) + 2); +	 case 0x9c: +	    return debug(stream, "3DSTATE_CLEAR_PARAMETERS", (cmd & 0xffff) + 2); +	 default: +	    assert(0); +	    return 0; +	 } +	 break; +      case 0x1e: +	 if (cmd & (1 << 23)) +	    return debug(stream, "???", (cmd & 0xffff) + 1); +	 else +	    return debug(stream, "", 1); +	 break; +      case 0x1f: +	 if ((cmd & (1 << 23)) == 0)	 +	    return debug_prim(stream, "3DPRIM (inline)", 1, (cmd & 0x1ffff) + 2); +	 else if (cmd & (1 << 17))  +	 { +	    if ((cmd & 0xffff) == 0) +	       return debug_variable_length_prim(stream); +	    else +	       return debug_prim(stream, "3DPRIM (indexed)", 0, (((cmd & 0xffff) + 1) / 2) + 1); +	 } +	 else +	    return debug_prim(stream, "3DPRIM  (indirect sequential)", 0, 2);  +	 break; +      default: +	 return debug(stream, "", 0); +      } +   default: +      assert(0); +  
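+      /* An unrecognized client type in bits 31:29 means the packet
+       * length cannot be determined, so give up on the rest of the
+       * batch rather than guess.
+       */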
    return 0; +   } + +   assert(0); +   return 0; +} + + + +void +i915_dump_batchbuffer( struct i915_batchbuffer *batch ) +{ +   struct debug_stream stream; +   unsigned *start = (unsigned*)batch->map; +   unsigned *end = (unsigned*)batch->ptr; +   unsigned long bytes = (unsigned long) (end - start) * 4; +   boolean done = FALSE; + +   stream.offset = 0; +   stream.ptr = (char *)start; +   stream.print_addresses = 0; + +   if (!start || !end) { +      debug_printf( "\n\nBATCH: ???\n"); +      return; +   } +    +   debug_printf( "\n\nBATCH: (%d)\n", bytes / 4); + +   while (!done && +	  stream.offset < bytes) +   { +      if (!i915_debug_packet( &stream )) +	 break; + +      assert(stream.offset <= bytes && +	     stream.offset >= 0); +   } + +   debug_printf( "END-BATCH\n\n\n"); +} + + diff --git a/src/gallium/drivers/i915simple/i915_debug.h b/src/gallium/drivers/i915simple/i915_debug.h new file mode 100644 index 0000000000..16ca7277c7 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug.h @@ -0,0 +1,114 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef I915_DEBUG_H +#define I915_DEBUG_H + +#include <stdarg.h> + +struct i915_context; + +struct debug_stream  +{ +   unsigned offset;		/* current gtt offset */ +   char *ptr;		/* pointer to gtt offset zero */ +   char *end;		/* pointer to gtt offset zero */ +   unsigned print_addresses; +}; + + +/* Internal functions + */ +void i915_disassemble_program(struct debug_stream *stream,  +			      const unsigned *program, unsigned sz); + +void i915_print_ureg(const char *msg, unsigned ureg); + + +#define DEBUG_BATCH	 0x1 +#define DEBUG_BLIT       0x2 +#define DEBUG_BUFFER     0x4 +#define DEBUG_CONSTANTS  0x8 +#define DEBUG_CONTEXT    0x10 +#define DEBUG_DRAW	 0x20 +#define DEBUG_DYNAMIC	 0x40 +#define DEBUG_FLUSH      0x80 +#define DEBUG_MAP	 0x100 +#define DEBUG_PROGRAM	 0x200 +#define DEBUG_REGIONS    0x400 +#define DEBUG_SAMPLER	 0x800 +#define DEBUG_STATIC	 0x1000 +#define DEBUG_SURFACE    0x2000 +#define DEBUG_WINSYS     0x4000 + +#include "pipe/p_compiler.h" + +#if defined(DEBUG) && defined(FILE_DEBUG_FLAG) + +#include "pipe/internal/p_winsys_screen.h" + +static INLINE void +I915_DBG( +   struct i915_context  *i915, +   const char           *fmt, +                        ... ) +{ +   if ((i915)->debug & FILE_DEBUG_FLAG) { +      va_list  args; + +      va_start( args, fmt ); +      debug_vprintf( fmt, args ); +      va_end( args ); +   } +} + +#else + +static INLINE void +I915_DBG( +   struct i915_context  *i915, +   const char           *fmt, +                        ... ) +{ +   (void) i915; +   (void) fmt; +} + +#endif + + +struct i915_batchbuffer; + +void i915_dump_batchbuffer( struct i915_batchbuffer *i915 ); + +void i915_debug_init( struct i915_context *i915 ); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c new file mode 100644 index 0000000000..9c5b117b6d --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug_fp.c @@ -0,0 +1,363 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "i915_reg.h" +#include "i915_debug.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_memory.h" + + +static void +PRINTF( +   struct debug_stream  *stream, +   const char           *fmt, +                        ... ) +{ +   va_list  args; + +   va_start( args, fmt ); +   debug_vprintf( fmt, args ); +   va_end( args ); +} + + +static const char *opcodes[0x20] = { +   "NOP", +   "ADD", +   "MOV", +   "MUL", +   "MAD", +   "DP2ADD", +   "DP3", +   "DP4", +   "FRC", +   "RCP", +   "RSQ", +   "EXP", +   "LOG", +   "CMP", +   "MIN", +   "MAX", +   "FLR", +   "MOD", +   "TRC", +   "SGE", +   "SLT", +   "TEXLD", +   "TEXLDP", +   "TEXLDB", +   "TEXKILL", +   "DCL", +   "0x1a", +   "0x1b", +   "0x1c", +   "0x1d", +   "0x1e", +   "0x1f", +}; + + +static const int args[0x20] = { +   0,                           /* 0 nop */ +   2,                           /* 1 add */ +   1,                           /* 2 mov */ +   2,                           /* 3 m ul */ +   3,                           /* 4 mad */ +   3,                           /* 5 dp2add */ +   2,                           /* 6 dp3 */ +   2,                           /* 7 dp4 */ +   1,                           /* 8 frc */ +   1,                           /* 9 rcp */ +   1,                           /* a rsq */ +   1,                           /* b exp */ +   1,                           /* c log */ +   3,                           /* d cmp */ +   2,                           /* e min */ +   2,                           /* f max */ +   1,                           /* 10 flr */ +   1,                           /* 11 mod */ +   1,                           /* 12 trc */ +   2,                           /* 13 sge */ +   2,                           /* 14 slt */ +   1, +   1, +   1, +   1, +   0, +   0, +   0, +   0, +   0, +   0, +   0, +}; + + +static const char *regname[0x8] = { +   "R", +   "T", +   "CONST", +   "S", +   "OC", +   "OD", +   "U", +   "UNKNOWN", +}; + +static void +print_reg_type_nr(struct debug_stream *stream, unsigned type, unsigned nr) +{ +   switch (type) { +   case REG_TYPE_T: +      switch (nr) { +      case T_DIFFUSE: +         PRINTF(stream, "T_DIFFUSE"); +         return; +      case T_SPECULAR: +         PRINTF(stream, "T_SPECULAR"); +         return; +      case T_FOG_W: +         PRINTF(stream, "T_FOG_W"); +         return; +      default: +         PRINTF(stream, "T_TEX%d", nr); +         return; +      } +   case REG_TYPE_OC: +      if (nr == 0) { +         PRINTF(stream, "oC"); +         return; +      } +      break; +   case REG_TYPE_OD: +      if (nr == 0) { +         PRINTF(stream, "oD"); +         return; +      } +      break; +   default: +      break; +   } + +   PRINTF(stream, "%s[%d]", regname[type], nr); +} + +#define REG_SWIZZLE_MASK 0x7777 +#define REG_NEGATE_MASK 0x8888 + +#define REG_SWIZZLE_XYZW ((SRC_X << A2_SRC2_CHANNEL_X_SHIFT) |	\ +		      (SRC_Y << A2_SRC2_CHANNEL_Y_SHIFT) |	\ +		      (SRC_Z << A2_SRC2_CHANNEL_Z_SHIFT) |	\ +		      (SRC_W << A2_SRC2_CHANNEL_W_SHIFT)) + + +static void +print_reg_neg_swizzle(struct debug_stream *stream, unsigned reg) +{ +   int i; + +   if ((reg & REG_SWIZZLE_MASK) == REG_SWIZZLE_XYZW && +       (reg & REG_NEGATE_MASK) == 0) +      return; + +   PRINTF(stream, "."); + +   for (i = 3; i >= 0; i--) { +      if (reg & (1 << ((i * 4) + 3))) +         PRINTF(stream, "-"); + +      switch ((reg >> (i * 4)) & 0x7) { +      case 0: +         PRINTF(stream, 
"x"); +         break; +      case 1: +         PRINTF(stream, "y"); +         break; +      case 2: +         PRINTF(stream, "z"); +         break; +      case 3: +         PRINTF(stream, "w"); +         break; +      case 4: +         PRINTF(stream, "0"); +         break; +      case 5: +         PRINTF(stream, "1"); +         break; +      default: +         PRINTF(stream, "?"); +         break; +      } +   } +} + + +static void +print_src_reg(struct debug_stream *stream, unsigned dword) +{ +   unsigned nr = (dword >> A2_SRC2_NR_SHIFT) & REG_NR_MASK; +   unsigned type = (dword >> A2_SRC2_TYPE_SHIFT) & REG_TYPE_MASK; +   print_reg_type_nr(stream, type, nr); +   print_reg_neg_swizzle(stream, dword); +} + + +static void +print_dest_reg(struct debug_stream *stream, unsigned dword) +{ +   unsigned nr = (dword >> A0_DEST_NR_SHIFT) & REG_NR_MASK; +   unsigned type = (dword >> A0_DEST_TYPE_SHIFT) & REG_TYPE_MASK; +   print_reg_type_nr(stream, type, nr); +   if ((dword & A0_DEST_CHANNEL_ALL) == A0_DEST_CHANNEL_ALL) +      return; +   PRINTF(stream, "."); +   if (dword & A0_DEST_CHANNEL_X) +      PRINTF(stream, "x"); +   if (dword & A0_DEST_CHANNEL_Y) +      PRINTF(stream, "y"); +   if (dword & A0_DEST_CHANNEL_Z) +      PRINTF(stream, "z"); +   if (dword & A0_DEST_CHANNEL_W) +      PRINTF(stream, "w"); +} + + +#define GET_SRC0_REG(r0, r1) ((r0<<14)|(r1>>A1_SRC0_CHANNEL_W_SHIFT)) +#define GET_SRC1_REG(r0, r1) ((r0<<8)|(r1>>A2_SRC1_CHANNEL_W_SHIFT)) +#define GET_SRC2_REG(r)      (r) + + +static void +print_arith_op(struct debug_stream *stream,  +	       unsigned opcode, const unsigned * program) +{ +   if (opcode != A0_NOP) { +      print_dest_reg(stream, program[0]); +      if (program[0] & A0_DEST_SATURATE) +         PRINTF(stream, " = SATURATE "); +      else +         PRINTF(stream, " = "); +   } + +   PRINTF(stream, "%s ", opcodes[opcode]); + +   print_src_reg(stream, GET_SRC0_REG(program[0], program[1])); +   if (args[opcode] == 1) { +      PRINTF(stream, "\n"); +      return; +   } + +   PRINTF(stream, ", "); +   print_src_reg(stream, GET_SRC1_REG(program[1], program[2])); +   if (args[opcode] == 2) { +      PRINTF(stream, "\n"); +      return; +   } + +   PRINTF(stream, ", "); +   print_src_reg(stream, GET_SRC2_REG(program[2])); +   PRINTF(stream, "\n"); +   return; +} + + +static void +print_tex_op(struct debug_stream *stream,  +	     unsigned opcode, const unsigned * program) +{ +   print_dest_reg(stream, program[0] | A0_DEST_CHANNEL_ALL); +   PRINTF(stream, " = "); + +   PRINTF(stream, "%s ", opcodes[opcode]); + +   PRINTF(stream, "S[%d],", program[0] & T0_SAMPLER_NR_MASK); + +   print_reg_type_nr(stream,  +		     (program[1] >> T1_ADDRESS_REG_TYPE_SHIFT) & +                     REG_TYPE_MASK, +                     (program[1] >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK); +   PRINTF(stream, "\n"); +} + +static void +print_texkil_op(struct debug_stream *stream,  +                unsigned opcode, const unsigned * program) +{ +   PRINTF(stream, "TEXKIL "); + +   print_reg_type_nr(stream,  +		     (program[1] >> T1_ADDRESS_REG_TYPE_SHIFT) & +                     REG_TYPE_MASK, +                     (program[1] >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK); +   PRINTF(stream, "\n"); +} + +static void +print_dcl_op(struct debug_stream *stream,  +	     unsigned opcode, const unsigned * program) +{ +   PRINTF(stream, "%s ", opcodes[opcode]); +   print_dest_reg(stream,  +		  program[0] | A0_DEST_CHANNEL_ALL); +   PRINTF(stream, "\n"); +} + + +void +i915_disassemble_program(struct debug_stream 
*stream,  +			 const unsigned * program, unsigned sz) +{ +   unsigned i; + +   PRINTF(stream, "\t\tBEGIN\n"); + +   assert((program[0] & 0x1ff) + 2 == sz); + +   program++; +   for (i = 1; i < sz; i += 3, program += 3) { +      unsigned opcode = program[0] & (0x1f << 24); + +      PRINTF(stream, "\t\t"); + +      if ((int) opcode >= A0_NOP && opcode <= A0_SLT) +         print_arith_op(stream, opcode >> 24, program); +      else if (opcode >= T0_TEXLD && opcode < T0_TEXKILL) +         print_tex_op(stream, opcode >> 24, program); +      else if (opcode == T0_TEXKILL) +         print_texkil_op(stream, opcode >> 24, program); +      else if (opcode == D0_DCL) +         print_dcl_op(stream, opcode >> 24, program); +      else +         PRINTF(stream, "Unknown opcode 0x%x\n", opcode); +   } + +   PRINTF(stream, "\t\tEND\n\n"); +} + + diff --git a/src/gallium/drivers/i915simple/i915_flush.c b/src/gallium/drivers/i915simple/i915_flush.c new file mode 100644 index 0000000000..472e0ab774 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_flush.c @@ -0,0 +1,78 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "draw/draw_context.h" +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_batch.h" + + +static void i915_flush( struct pipe_context *pipe, +                        unsigned flags, +                        struct pipe_fence_handle **fence ) +{ +   struct i915_context *i915 = i915_context(pipe); + +   draw_flush(i915->draw); + +   /* Do we need to emit an MI_FLUSH command to flush the hardware +    * caches? 
+    */ +   if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) { +      unsigned flush = MI_FLUSH; +       +      if (!(flags & PIPE_FLUSH_RENDER_CACHE)) +	 flush |= INHIBIT_FLUSH_RENDER_CACHE; + +      if (flags & PIPE_FLUSH_TEXTURE_CACHE) +	 flush |= FLUSH_MAP_CACHE; + +      if (!BEGIN_BATCH(1, 0)) { +	 FLUSH_BATCH(NULL); +	 assert(BEGIN_BATCH(1, 0)); +      } +      OUT_BATCH( flush ); +   } + +   /* If there are no flags, just flush pending commands to hardware: +    */ +   FLUSH_BATCH(fence); +   i915->vbo_flushed = 1; +} + + + +void i915_init_flush_functions( struct i915_context *i915 ) +{ +   i915->pipe.flush = i915_flush; +} diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915simple/i915_fpc.h new file mode 100644 index 0000000000..2f0f99d046 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc.h @@ -0,0 +1,207 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef I915_FPC_H +#define I915_FPC_H + + +#include "i915_context.h" +#include "i915_reg.h" + + + +#define I915_PROGRAM_SIZE 192 + + + +/** + * Program translation state + */ +struct i915_fp_compile { +   struct i915_fragment_shader *shader;  /* the shader we're compiling */ + +   boolean used_constants[I915_MAX_CONSTANT]; + +   /** maps TGSI immediate index to constant slot */ +   uint num_immediates; +   uint immediates_map[I915_MAX_CONSTANT]; +   float immediates[I915_MAX_CONSTANT][4]; + +   boolean first_instruction; + +   uint declarations[I915_PROGRAM_SIZE]; +   uint program[I915_PROGRAM_SIZE]; + +   uint *csr;            /**< Cursor, points into program. */ + +   uint *decl;           /**< Cursor, points into declarations. 
*/ + +   uint decl_s;          /**< flags for which s regs need to be decl'd */ +   uint decl_t;          /**< flags for which t regs need to be decl'd */ + +   uint temp_flag;       /**< Tracks temporary regs which are in use */ +   uint utemp_flag;      /**< Tracks TYPE_U temporary regs which are in use */ + +   uint nr_tex_indirect; +   uint nr_tex_insn; +   uint nr_alu_insn; +   uint nr_decl_insn; + +   boolean error;      /**< Set if i915_program_error() is called */ +   uint wpos_tex; +   uint NumNativeInstructions; +   uint NumNativeAluInstructions; +   uint NumNativeTexInstructions; +   uint NumNativeTexIndirections; +}; + + +/* Having zero and one in here makes the definition of swizzle a lot + * easier. + */ +#define UREG_TYPE_SHIFT               29 +#define UREG_NR_SHIFT                 24 +#define UREG_CHANNEL_X_NEGATE_SHIFT   23 +#define UREG_CHANNEL_X_SHIFT          20 +#define UREG_CHANNEL_Y_NEGATE_SHIFT   19 +#define UREG_CHANNEL_Y_SHIFT          16 +#define UREG_CHANNEL_Z_NEGATE_SHIFT   15 +#define UREG_CHANNEL_Z_SHIFT          12 +#define UREG_CHANNEL_W_NEGATE_SHIFT   11 +#define UREG_CHANNEL_W_SHIFT          8 +#define UREG_CHANNEL_ZERO_NEGATE_MBZ  5 +#define UREG_CHANNEL_ZERO_SHIFT       4 +#define UREG_CHANNEL_ONE_NEGATE_MBZ   1 +#define UREG_CHANNEL_ONE_SHIFT        0 + +#define UREG_BAD          0xffffffff    /* not a valid ureg */ + +#define X    SRC_X +#define Y    SRC_Y +#define Z    SRC_Z +#define W    SRC_W +#define ZERO SRC_ZERO +#define ONE  SRC_ONE + +/* Construct a ureg: + */ +#define UREG( type, nr ) (((type)<< UREG_TYPE_SHIFT) |		\ +			  ((nr)  << UREG_NR_SHIFT) |		\ +			  (X     << UREG_CHANNEL_X_SHIFT) |	\ +			  (Y     << UREG_CHANNEL_Y_SHIFT) |	\ +			  (Z     << UREG_CHANNEL_Z_SHIFT) |	\ +			  (W     << UREG_CHANNEL_W_SHIFT) |	\ +			  (ZERO  << UREG_CHANNEL_ZERO_SHIFT) |	\ +			  (ONE   << UREG_CHANNEL_ONE_SHIFT)) + +#define GET_CHANNEL_SRC( reg, channel ) ((reg<<(channel*4)) & (0xf<<20)) +#define CHANNEL_SRC( src, channel ) (src>>(channel*4)) + +#define GET_UREG_TYPE(reg) (((reg)>>UREG_TYPE_SHIFT)®_TYPE_MASK) +#define GET_UREG_NR(reg)   (((reg)>>UREG_NR_SHIFT)®_NR_MASK) + + + +#define UREG_XYZW_CHANNEL_MASK 0x00ffff00 + +/* One neat thing about the UREG representation:   + */ +static INLINE int +swizzle(int reg, uint x, uint y, uint z, uint w) +{ +   assert(x <= SRC_ONE); +   assert(y <= SRC_ONE); +   assert(z <= SRC_ONE); +   assert(w <= SRC_ONE); +   return ((reg & ~UREG_XYZW_CHANNEL_MASK) | +           CHANNEL_SRC(GET_CHANNEL_SRC(reg, x), 0) | +           CHANNEL_SRC(GET_CHANNEL_SRC(reg, y), 1) | +           CHANNEL_SRC(GET_CHANNEL_SRC(reg, z), 2) | +           CHANNEL_SRC(GET_CHANNEL_SRC(reg, w), 3)); +} + + + +/*********************************************************************** + * Public interface for the compiler + */ +extern void +i915_translate_fragment_program( struct i915_context *i915, +                                 struct i915_fragment_shader *fs); + + + +extern uint i915_get_temp(struct i915_fp_compile *p); +extern uint i915_get_utemp(struct i915_fp_compile *p); +extern void i915_release_utemps(struct i915_fp_compile *p); + + +extern uint i915_emit_texld(struct i915_fp_compile *p, +                              uint dest, +                              uint destmask, +                              uint sampler, uint coord, uint op); + +extern uint i915_emit_arith(struct i915_fp_compile *p, +                              uint op, +                              uint dest, +                              uint mask, +                     
         uint saturate, +                              uint src0, uint src1, uint src2); + +extern uint i915_emit_decl(struct i915_fp_compile *p, +                             uint type, uint nr, uint d0_flags); + + +extern uint i915_emit_const1f(struct i915_fp_compile *p, float c0); + +extern uint i915_emit_const2f(struct i915_fp_compile *p, +                                float c0, float c1); + +extern uint i915_emit_const4fv(struct i915_fp_compile *p, +                                 const float * c); + +extern uint i915_emit_const4f(struct i915_fp_compile *p, +                                float c0, float c1, +                                float c2, float c3); + + +/*====================================================================== + * i915_fpc_debug.c + */ +extern void i915_disassemble_program(const uint * program, uint sz); + + +/*====================================================================== + * i915_fpc_translate.c + */ + +extern void +i915_program_error(struct i915_fp_compile *p, const char *msg, ...); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_fpc_emit.c b/src/gallium/drivers/i915simple/i915_fpc_emit.c new file mode 100644 index 0000000000..b054ce41d3 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc_emit.c @@ -0,0 +1,375 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_fpc.h" +#include "util/u_math.h" + + +#define A0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define D0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define T0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define A0_SRC0( reg ) (((reg)&UREG_MASK)>>UREG_A0_SRC0_SHIFT_LEFT) +#define A1_SRC0( reg ) (((reg)&UREG_MASK)<<UREG_A1_SRC0_SHIFT_RIGHT) +#define A1_SRC1( reg ) (((reg)&UREG_MASK)>>UREG_A1_SRC1_SHIFT_LEFT) +#define A2_SRC1( reg ) (((reg)&UREG_MASK)<<UREG_A2_SRC1_SHIFT_RIGHT) +#define A2_SRC2( reg ) (((reg)&UREG_MASK)>>UREG_A2_SRC2_SHIFT_LEFT) + +/* These are special, and don't have swizzle/negate bits. 
+ */ +#define T0_SAMPLER( reg )     (GET_UREG_NR(reg)<<T0_SAMPLER_NR_SHIFT) +#define T1_ADDRESS_REG( reg ) ((GET_UREG_NR(reg)<<T1_ADDRESS_REG_NR_SHIFT) | \ +			       (GET_UREG_TYPE(reg)<<T1_ADDRESS_REG_TYPE_SHIFT)) + + +/* Macros for translating UREG's into the various register fields used + * by the I915 programmable unit. + */ +#define UREG_A0_DEST_SHIFT_LEFT  (UREG_TYPE_SHIFT - A0_DEST_TYPE_SHIFT) +#define UREG_A0_SRC0_SHIFT_LEFT  (UREG_TYPE_SHIFT - A0_SRC0_TYPE_SHIFT) +#define UREG_A1_SRC0_SHIFT_RIGHT (A1_SRC0_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT) +#define UREG_A1_SRC1_SHIFT_LEFT  (UREG_TYPE_SHIFT - A1_SRC1_TYPE_SHIFT) +#define UREG_A2_SRC1_SHIFT_RIGHT (A2_SRC1_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT) +#define UREG_A2_SRC2_SHIFT_LEFT  (UREG_TYPE_SHIFT - A2_SRC2_TYPE_SHIFT) + +#define UREG_MASK         0xffffff00 +#define UREG_TYPE_NR_MASK ((REG_TYPE_MASK << UREG_TYPE_SHIFT) | \ +  			   (REG_NR_MASK << UREG_NR_SHIFT)) + + +uint +i915_get_temp(struct i915_fp_compile *p) +{ +   int bit = ffs(~p->temp_flag); +   if (!bit) { +      i915_program_error(p, "i915_get_temp: out of temporaries\n"); +      return 0; +   } + +   p->temp_flag |= 1 << (bit - 1); +   return bit - 1; +} + + +static void +i915_release_temp(struct i915_fp_compile *p, int reg) +{ +   p->temp_flag &= ~(1 << reg); +} + + +/** + * Get unpreserved temporary, a temp whose value is not preserved between + * PS program phases. + */ +uint +i915_get_utemp(struct i915_fp_compile * p) +{ +   int bit = ffs(~p->utemp_flag); +   if (!bit) { +      i915_program_error(p, "i915_get_utemp: out of temporaries\n"); +      return 0; +   } + +   p->utemp_flag |= 1 << (bit - 1); +   return UREG(REG_TYPE_U, (bit - 1)); +} + +void +i915_release_utemps(struct i915_fp_compile *p) +{ +   p->utemp_flag = ~0x7; +} + + +uint +i915_emit_decl(struct i915_fp_compile *p, +               uint type, uint nr, uint d0_flags) +{ +   uint reg = UREG(type, nr); + +   if (type == REG_TYPE_T) { +      if (p->decl_t & (1 << nr)) +         return reg; + +      p->decl_t |= (1 << nr); +   } +   else if (type == REG_TYPE_S) { +      if (p->decl_s & (1 << nr)) +         return reg; + +      p->decl_s |= (1 << nr); +   } +   else +      return reg; + +   *(p->decl++) = (D0_DCL | D0_DEST(reg) | d0_flags); +   *(p->decl++) = D1_MBZ; +   *(p->decl++) = D2_MBZ; + +   p->nr_decl_insn++; +   return reg; +} + +uint +i915_emit_arith(struct i915_fp_compile * p, +                uint op, +                uint dest, +                uint mask, +                uint saturate, uint src0, uint src1, uint src2) +{ +   uint c[3]; +   uint nr_const = 0; + +   assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST); +   dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)); +   assert(dest); + +   if (GET_UREG_TYPE(src0) == REG_TYPE_CONST) +      c[nr_const++] = 0; +   if (GET_UREG_TYPE(src1) == REG_TYPE_CONST) +      c[nr_const++] = 1; +   if (GET_UREG_TYPE(src2) == REG_TYPE_CONST) +      c[nr_const++] = 2; + +   /* Recursively call this function to MOV additional const values +    * into temporary registers.  Use utemp registers for this - +    * currently shouldn't be possible to run out, but keep an eye on +    * this. 
+    */ +   if (nr_const > 1) { +      uint s[3], first, i, old_utemp_flag; + +      s[0] = src0; +      s[1] = src1; +      s[2] = src2; +      old_utemp_flag = p->utemp_flag; + +      first = GET_UREG_NR(s[c[0]]); +      for (i = 1; i < nr_const; i++) { +         if (GET_UREG_NR(s[c[i]]) != first) { +            uint tmp = i915_get_utemp(p); + +            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, +                            s[c[i]], 0, 0); +            s[c[i]] = tmp; +         } +      } + +      src0 = s[0]; +      src1 = s[1]; +      src2 = s[2]; +      p->utemp_flag = old_utemp_flag;   /* restore */ +   } + +   *(p->csr++) = (op | A0_DEST(dest) | mask | saturate | A0_SRC0(src0)); +   *(p->csr++) = (A1_SRC0(src0) | A1_SRC1(src1)); +   *(p->csr++) = (A2_SRC1(src1) | A2_SRC2(src2)); + +   p->nr_alu_insn++; +   return dest; +} + + +/** + * Emit a texture load or texkill instruction. + * \param dest  the dest i915 register + * \param destmask  the dest register writemask + * \param sampler  the i915 sampler register + * \param coord  the i915 source texcoord operand + * \param opcode  the instruction opcode + */ +uint i915_emit_texld( struct i915_fp_compile *p, +			uint dest, +			uint destmask, +			uint sampler, +			uint coord, +			uint opcode ) +{ +   const uint k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord)); +   int temp = -1; + +   if (coord != k) { +      /* texcoord is swizzled or negated.  Need to allocate a new temporary +       * register (a utemp / unpreserved temp) won't do. +       */ +      uint tempReg; + +      temp = i915_get_temp(p);           /* get temp reg index */ +      tempReg = UREG(REG_TYPE_R, temp);  /* make i915 register */ + +      i915_emit_arith( p, A0_MOV, +                       tempReg, A0_DEST_CHANNEL_ALL, /* dest reg, writemask */ +                       0,                            /* saturate */ +                       coord, 0, 0 );                /* src0, src1, src2 */ + +      /* new src texcoord is tempReg */ +      coord = tempReg; +   } + +   /* Don't worry about saturate as we only support   +    */ +   if (destmask != A0_DEST_CHANNEL_ALL) { +      /* if not writing to XYZW... */ +      uint tmp = i915_get_utemp(p); +      i915_emit_texld( p, tmp, A0_DEST_CHANNEL_ALL, sampler, coord, opcode ); +      i915_emit_arith( p, A0_MOV, dest, destmask, 0, tmp, 0, 0 ); +      /* XXX release utemp here? */ +   } +   else { +      assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST); +      assert(dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest))); + +      /* is the sampler coord a texcoord input reg? 
*/ +      if (GET_UREG_TYPE(coord) != REG_TYPE_T) { +	 p->nr_tex_indirect++; +      } + +      *(p->csr++) = (opcode |  +		     T0_DEST( dest ) | +		     T0_SAMPLER( sampler )); + +      *(p->csr++) = T1_ADDRESS_REG( coord ); +      *(p->csr++) = T2_MBZ; + +      p->nr_tex_insn++; +   } + +   if (temp >= 0) +      i915_release_temp(p, temp); + +   return dest; +} + + +uint +i915_emit_const1f(struct i915_fp_compile * p, float c0) +{ +   struct i915_fragment_shader *ifs = p->shader; +   unsigned reg, idx; + +   if (c0 == 0.0) +      return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); +   if (c0 == 1.0) +      return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE); + +   for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { +      if (ifs->constant_flags[reg] == I915_CONSTFLAG_USER) +         continue; +      for (idx = 0; idx < 4; idx++) { +         if (!(ifs->constant_flags[reg] & (1 << idx)) || +             ifs->constants[reg][idx] == c0) { +            ifs->constants[reg][idx] = c0; +            ifs->constant_flags[reg] |= 1 << idx; +            if (reg + 1 > ifs->num_constants) +               ifs->num_constants = reg + 1; +            return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE); +         } +      } +   } + +   i915_program_error(p, "i915_emit_const1f: out of constants\n"); +   return 0; +} + +uint +i915_emit_const2f(struct i915_fp_compile * p, float c0, float c1) +{ +   struct i915_fragment_shader *ifs = p->shader; +   unsigned reg, idx; + +   if (c0 == 0.0) +      return swizzle(i915_emit_const1f(p, c1), ZERO, X, Z, W); +   if (c0 == 1.0) +      return swizzle(i915_emit_const1f(p, c1), ONE, X, Z, W); + +   if (c1 == 0.0) +      return swizzle(i915_emit_const1f(p, c0), X, ZERO, Z, W); +   if (c1 == 1.0) +      return swizzle(i915_emit_const1f(p, c0), X, ONE, Z, W); + +   for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { +      if (ifs->constant_flags[reg] == 0xf || +          ifs->constant_flags[reg] == I915_CONSTFLAG_USER) +         continue; +      for (idx = 0; idx < 3; idx++) { +         if (!(ifs->constant_flags[reg] & (3 << idx))) { +            ifs->constants[reg][idx + 0] = c0; +            ifs->constants[reg][idx + 1] = c1; +            ifs->constant_flags[reg] |= 3 << idx; +            if (reg + 1 > ifs->num_constants) +               ifs->num_constants = reg + 1; +            return swizzle(UREG(REG_TYPE_CONST, reg), idx, idx + 1, ZERO, ONE); +         } +      } +   } + +   i915_program_error(p, "i915_emit_const2f: out of constants\n"); +   return 0; +} + + + +uint +i915_emit_const4f(struct i915_fp_compile * p, +                  float c0, float c1, float c2, float c3) +{ +   struct i915_fragment_shader *ifs = p->shader; +   unsigned reg; + +   for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { +      if (ifs->constant_flags[reg] == 0xf && +          ifs->constants[reg][0] == c0 && +          ifs->constants[reg][1] == c1 && +          ifs->constants[reg][2] == c2 && +          ifs->constants[reg][3] == c3) { +         return UREG(REG_TYPE_CONST, reg); +      } +      else if (ifs->constant_flags[reg] == 0) { + +         ifs->constants[reg][0] = c0; +         ifs->constants[reg][1] = c1; +         ifs->constants[reg][2] = c2; +         ifs->constants[reg][3] = c3; +         ifs->constant_flags[reg] = 0xf; +         if (reg + 1 > ifs->num_constants) +            ifs->num_constants = reg + 1; +         return UREG(REG_TYPE_CONST, reg); +      } +   } + +   i915_program_error(p, "i915_emit_const4f: out of constants\n"); +   return 0; +} + + +uint 
+i915_emit_const4fv(struct i915_fp_compile * p, const float * c) +{ +   return i915_emit_const4f(p, c[0], c[1], c[2], c[3]); +} diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c new file mode 100644 index 0000000000..d92bdc1bc6 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c @@ -0,0 +1,1190 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include <stdarg.h> + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_fpc.h" + +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_string.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "draw/draw_vertex.h" + + +/** + * Simple pass-through fragment shader to use when we don't have + * a real shader (or it fails to compile for some reason). + */ +static unsigned passthrough[] =  +{ +   _3DSTATE_PIXEL_SHADER_PROGRAM | ((2*3)-1), + +   /* declare input color: +    */ +   (D0_DCL |  +    (REG_TYPE_T << D0_TYPE_SHIFT) |  +    (T_DIFFUSE << D0_NR_SHIFT) |  +    D0_CHANNEL_ALL), +   0, +   0, + +   /* move to output color: +    */ +   (A0_MOV |  +    (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) |  +    A0_DEST_CHANNEL_ALL |  +    (REG_TYPE_T << A0_SRC0_TYPE_SHIFT) | +    (T_DIFFUSE << A0_SRC0_NR_SHIFT)), +   0x01230000,			/* .xyzw */ +   0 +}; + + +/* 1, -1/3!, 1/5!, -1/7! */ +static const float sin_constants[4] = { 1.0, +   -1.0f / (3 * 2 * 1), +   1.0f / (5 * 4 * 3 * 2 * 1), +   -1.0f / (7 * 6 * 5 * 4 * 3 * 2 * 1) +}; + +/* 1, -1/2!, 1/4!, -1/6! 
*/ +static const float cos_constants[4] = { 1.0, +   -1.0f / (2 * 1), +   1.0f / (4 * 3 * 2 * 1), +   -1.0f / (6 * 5 * 4 * 3 * 2 * 1) +}; + + + +/** + * component-wise negation of ureg + */ +static INLINE int +negate(int reg, int x, int y, int z, int w) +{ +   /* Another neat thing about the UREG representation */ +   return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) | +                 ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) | +                 ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) | +                 ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT)); +} + + +/** + * In the event of a translation failure, we'll generate a simple color + * pass-through program. + */ +static void +i915_use_passthrough_shader(struct i915_fragment_shader *fs) +{ +   fs->program = (uint *) MALLOC(sizeof(passthrough)); +   if (fs->program) { +      memcpy(fs->program, passthrough, sizeof(passthrough)); +      fs->program_len = Elements(passthrough); +   } +   fs->num_constants = 0; +} + + +void +i915_program_error(struct i915_fp_compile *p, const char *msg, ...) +{ +   va_list args; +   char buffer[1024]; + +   debug_printf("i915_program_error: "); +   va_start( args, msg );   +   util_vsnprintf( buffer, sizeof(buffer), msg, args ); +   va_end( args ); +   debug_printf(buffer); +   debug_printf("\n"); + +   p->error = 1; +} + + + +/** + * Construct a ureg for the given source register.  Will emit + * constants, apply swizzling and negation as needed. + */ +static uint +src_vector(struct i915_fp_compile *p, +           const struct tgsi_full_src_register *source) +{ +   uint index = source->SrcRegister.Index; +   uint src = 0, sem_name, sem_ind; + +   switch (source->SrcRegister.File) { +   case TGSI_FILE_TEMPORARY: +      if (source->SrcRegister.Index >= I915_MAX_TEMPORARY) { +         i915_program_error(p, "Exceeded max temporary reg"); +         return 0; +      } +      src = UREG(REG_TYPE_R, index); +      break; +   case TGSI_FILE_INPUT: +      /* XXX: Packing COL1, FOGC into a single attribute works for +       * texenv programs, but will fail for real fragment programs +       * that use these attributes and expect them to be a full 4 +       * components wide.  Could use a texcoord to pass these +       * attributes if necessary, but that won't work in the general +       * case. +       *  +       * We also use a texture coordinate to pass wpos when possible. 
+       */ + +      sem_name = p->shader->info.input_semantic_name[index]; +      sem_ind = p->shader->info.input_semantic_index[index]; + +      switch (sem_name) { +      case TGSI_SEMANTIC_POSITION: +         debug_printf("SKIP SEM POS\n"); +         /* +         assert(p->wpos_tex != -1); +         src = i915_emit_decl(p, REG_TYPE_T, p->wpos_tex, D0_CHANNEL_ALL); +         */ +         break; +      case TGSI_SEMANTIC_COLOR: +         if (sem_ind == 0) { +            src = i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL); +         } +         else { +            /* secondary color */ +            assert(sem_ind == 1); +            src = i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ); +            src = swizzle(src, X, Y, Z, ONE); +         } +         break; +      case TGSI_SEMANTIC_FOG: +         src = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W); +         src = swizzle(src, W, W, W, W); +         break; +      case TGSI_SEMANTIC_GENERIC: +         /* usually a texcoord */ +         src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + sem_ind, D0_CHANNEL_ALL); +         break; +      default: +         i915_program_error(p, "Bad source->Index"); +         return 0; +      } +      break; + +   case TGSI_FILE_IMMEDIATE: +      assert(index < p->num_immediates); +      index = p->immediates_map[index]; +      /* fall-through */ +   case TGSI_FILE_CONSTANT: +      src = UREG(REG_TYPE_CONST, index); +      break; + +   default: +      i915_program_error(p, "Bad source->File"); +      return 0; +   } + +   if (source->SrcRegister.Extended) { +      src = swizzle(src, +                    source->SrcRegisterExtSwz.ExtSwizzleX, +                    source->SrcRegisterExtSwz.ExtSwizzleY, +                    source->SrcRegisterExtSwz.ExtSwizzleZ, +                    source->SrcRegisterExtSwz.ExtSwizzleW); +   } +   else { +      src = swizzle(src, +                    source->SrcRegister.SwizzleX, +                    source->SrcRegister.SwizzleY, +                    source->SrcRegister.SwizzleZ, +                    source->SrcRegister.SwizzleW); +   } + + +   /* There's both negate-all-components and per-component negation. +    * Try to handle both here. +    */ +   { +      int nx = source->SrcRegisterExtSwz.NegateX; +      int ny = source->SrcRegisterExtSwz.NegateY; +      int nz = source->SrcRegisterExtSwz.NegateZ; +      int nw = source->SrcRegisterExtSwz.NegateW; +      if (source->SrcRegister.Negate) { +         nx = !nx; +         ny = !ny; +         nz = !nz; +         nw = !nw; +      } +      src = negate(src, nx, ny, nz, nw); +   } + +   /* no abs() or post-abs negation */ +#if 0 +   /* XXX assertions disabled to allow arbfplight.c to run */ +   /* XXX enable these assertions, or fix things */ +   assert(!source->SrcRegisterExtMod.Absolute); +   assert(!source->SrcRegisterExtMod.Negate); +#endif +   return src; +} + + +/** + * Construct a ureg for a destination register. 
+ */ +static uint +get_result_vector(struct i915_fp_compile *p, +                  const struct tgsi_full_dst_register *dest) +{ +   switch (dest->DstRegister.File) { +   case TGSI_FILE_OUTPUT: +      { +         uint sem_name = p->shader->info.output_semantic_name[dest->DstRegister.Index]; +         switch (sem_name) { +         case TGSI_SEMANTIC_POSITION: +            return UREG(REG_TYPE_OD, 0); +         case TGSI_SEMANTIC_COLOR: +            return UREG(REG_TYPE_OC, 0); +         default: +            i915_program_error(p, "Bad inst->DstReg.Index/semantics"); +            return 0; +         } +      } +   case TGSI_FILE_TEMPORARY: +      return UREG(REG_TYPE_R, dest->DstRegister.Index); +   default: +      i915_program_error(p, "Bad inst->DstReg.File"); +      return 0; +   } +} + + +/** + * Compute flags for saturation and writemask. + */ +static uint +get_result_flags(const struct tgsi_full_instruction *inst) +{ +   const uint writeMask +      = inst->FullDstRegisters[0].DstRegister.WriteMask; +   uint flags = 0x0; + +   if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) +      flags |= A0_DEST_SATURATE; + +   if (writeMask & TGSI_WRITEMASK_X) +      flags |= A0_DEST_CHANNEL_X; +   if (writeMask & TGSI_WRITEMASK_Y) +      flags |= A0_DEST_CHANNEL_Y; +   if (writeMask & TGSI_WRITEMASK_Z) +      flags |= A0_DEST_CHANNEL_Z; +   if (writeMask & TGSI_WRITEMASK_W) +      flags |= A0_DEST_CHANNEL_W; + +   return flags; +} + + +/** + * Convert TGSI_TEXTURE_x token to DO_SAMPLE_TYPE_x token + */ +static uint +translate_tex_src_target(struct i915_fp_compile *p, uint tex) +{ +   switch (tex) { +   case TGSI_TEXTURE_1D: +      return D0_SAMPLE_TYPE_2D; +   case TGSI_TEXTURE_2D: +      return D0_SAMPLE_TYPE_2D; +   case TGSI_TEXTURE_RECT: +      return D0_SAMPLE_TYPE_2D; +   case TGSI_TEXTURE_3D: +      return D0_SAMPLE_TYPE_VOLUME; +   case TGSI_TEXTURE_CUBE: +      return D0_SAMPLE_TYPE_CUBE; +   default: +      i915_program_error(p, "TexSrc type"); +      return 0; +   } +} + + +/** + * Generate texel lookup instruction. + */ +static void +emit_tex(struct i915_fp_compile *p, +         const struct tgsi_full_instruction *inst, +         uint opcode) +{ +   uint texture = inst->InstructionExtTexture.Texture; +   uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; +   uint tex = translate_tex_src_target( p, texture ); +   uint sampler = i915_emit_decl(p, REG_TYPE_S, unit, tex); +   uint coord = src_vector( p, &inst->FullSrcRegisters[0]); + +   i915_emit_texld( p, +                    get_result_vector( p, &inst->FullDstRegisters[0] ), +                    get_result_flags( inst ), +                    sampler, +                    coord, +                    opcode); +} + + +/** + * Generate a simple arithmetic instruction + * \param opcode  the i915 opcode + * \param numArgs  the number of input/src arguments + */ +static void +emit_simple_arith(struct i915_fp_compile *p, +                  const struct tgsi_full_instruction *inst, +                  uint opcode, uint numArgs) +{ +   uint arg1, arg2, arg3; + +   assert(numArgs <= 3); + +   arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->FullSrcRegisters[0] ); +   arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->FullSrcRegisters[1] ); +   arg3 = (numArgs < 3) ? 
0 : src_vector( p, &inst->FullSrcRegisters[2] ); + +   i915_emit_arith( p, +                    opcode, +                    get_result_vector( p, &inst->FullDstRegisters[0]), +                    get_result_flags( inst ), 0, +                    arg1, +                    arg2, +                    arg3 ); +} + + +/** As above, but swap the first two src regs */ +static void +emit_simple_arith_swap2(struct i915_fp_compile *p, +                        const struct tgsi_full_instruction *inst, +                        uint opcode, uint numArgs) +{ +   struct tgsi_full_instruction inst2; + +   assert(numArgs == 2); + +   /* transpose first two registers */ +   inst2 = *inst; +   inst2.FullSrcRegisters[0] = inst->FullSrcRegisters[1]; +   inst2.FullSrcRegisters[1] = inst->FullSrcRegisters[0]; + +   emit_simple_arith(p, &inst2, opcode, numArgs); +} + + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +/* + * Translate TGSI instruction to i915 instruction. + * + * Possible concerns: + * + * SIN, COS -- could use another taylor step? + * LIT      -- results seem a little different to sw mesa + * LOG      -- different to mesa on negative numbers, but this is conformant. + */  +static void +i915_translate_instruction(struct i915_fp_compile *p, +                           const struct tgsi_full_instruction *inst) +{ +   uint writemask; +   uint src0, src1, src2, flags; +   uint tmp = 0; + +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_ABS: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      i915_emit_arith(p, +                      A0_MAX, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      src0, negate(src0, 1, 1, 1, 1), 0); +      break; + +   case TGSI_OPCODE_ADD: +      emit_simple_arith(p, inst, A0_ADD, 2); +      break; + +   case TGSI_OPCODE_CMP: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); +      src2 = src_vector(p, &inst->FullSrcRegisters[2]); +      i915_emit_arith(p, A0_CMP,  +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst),  +                      0, src0, src2, src1);   /* NOTE: order of src2, src1 */ +      break; + +   case TGSI_OPCODE_COS: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      tmp = i915_get_utemp(p); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_X, 0, +                      src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); + +      i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); + +      /* By choosing different taylor constants, could get rid of this mul: +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_X, 0, +                      tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0); + +      /*  +       * t0.xy = MUL x.xx11, x.x1111  ; x^2, x, 1, 1 +       * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, 1 +       * t0 = MUL t0.xxz1 t0.z111    ; x^6 x^4 x^2 1 +       * result = DP4 t0, cos_constants +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_XY, 0, +                      swizzle(tmp, X, X, ONE, ONE), +                      swizzle(tmp, X, ONE, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_XYZ, 0, +            
          swizzle(tmp, X, Y, X, ONE), +                      swizzle(tmp, X, X, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_XYZ, 0, +                      swizzle(tmp, X, X, Z, ONE), +                      swizzle(tmp, Z, ONE, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_DP4, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(tmp, ONE, Z, Y, X), +                      i915_emit_const4fv(p, cos_constants), 0); +      break; + +   case TGSI_OPCODE_DP3: +      emit_simple_arith(p, inst, A0_DP3, 2); +      break; + +   case TGSI_OPCODE_DP4: +      emit_simple_arith(p, inst, A0_DP4, 2); +      break; + +   case TGSI_OPCODE_DPH: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); + +      i915_emit_arith(p, +                      A0_DP4, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, X, Y, Z, ONE), src1, 0); +      break; + +   case TGSI_OPCODE_DST: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); + +      /* result[0] = 1    * 1; +       * result[1] = a[1] * b[1]; +       * result[2] = a[2] * 1; +       * result[3] = 1    * b[3]; +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, ONE, Y, Z, ONE), +                      swizzle(src1, ONE, Y, ONE, W), 0); +      break; + +   case TGSI_OPCODE_END: +      /* no-op */ +      break; + +   case TGSI_OPCODE_EX2: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); + +      i915_emit_arith(p, +                      A0_EXP, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, X, X, X, X), 0, 0); +      break; + +   case TGSI_OPCODE_FLR: +      emit_simple_arith(p, inst, A0_FLR, 1); +      break; + +   case TGSI_OPCODE_FRC: +      emit_simple_arith(p, inst, A0_FRC, 1); +      break; + +   case TGSI_OPCODE_KIL: +      /* kill if src[0].x < 0 || src[0].y < 0 ... 
*/ +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      tmp = i915_get_utemp(p); + +      i915_emit_texld(p, +                      tmp,                   /* dest reg: a dummy reg */ +                      A0_DEST_CHANNEL_ALL,   /* dest writemask */ +                      0,                     /* sampler */ +                      src0,                  /* coord*/ +                      T0_TEXKILL);           /* opcode */ +      break; + +   case TGSI_OPCODE_KILP: +      assert(0); /* not tested yet */ +      break; + +   case TGSI_OPCODE_LG2: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); + +      i915_emit_arith(p, +                      A0_LOG, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, X, X, X, X), 0, 0); +      break; + +   case TGSI_OPCODE_LIT: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      tmp = i915_get_utemp(p); + +      /* tmp = max( a.xyzw, a.00zw ) +       * XXX: Clamp tmp.w to -128..128 +       * tmp.y = log(tmp.y) +       * tmp.y = tmp.w * tmp.y +       * tmp.y = exp(tmp.y) +       * result = cmp (a.11-x1, a.1x01, a.1xy1 ) +       */ +      i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, +                      src0, swizzle(src0, ZERO, ZERO, Z, W), 0); + +      i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_Y, 0, +                      swizzle(tmp, Y, Y, Y, Y), 0, 0); + +      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_Y, 0, +                      swizzle(tmp, ZERO, Y, ZERO, ZERO), +                      swizzle(tmp, ZERO, W, ZERO, ZERO), 0); + +      i915_emit_arith(p, A0_EXP, tmp, A0_DEST_CHANNEL_Y, 0, +                      swizzle(tmp, Y, Y, Y, Y), 0, 0); + +      i915_emit_arith(p, A0_CMP, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0), +                      swizzle(tmp, ONE, X, ZERO, ONE), +                      swizzle(tmp, ONE, X, Y, ONE)); + +      break; + +   case TGSI_OPCODE_LRP: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); +      src2 = src_vector(p, &inst->FullSrcRegisters[2]); +      flags = get_result_flags(inst); +      tmp = i915_get_utemp(p); + +      /* b*a + c*(1-a) +       * +       * b*a + c - ca  +       * +       * tmp = b*a + c,  +       * result = (-c)*a + tmp  +       */ +      i915_emit_arith(p, A0_MAD, tmp, +                      flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2); + +      i915_emit_arith(p, A0_MAD, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp); +      break; + +   case TGSI_OPCODE_MAD: +      emit_simple_arith(p, inst, A0_MAD, 3); +      break; + +   case TGSI_OPCODE_MAX: +      emit_simple_arith(p, inst, A0_MAX, 2); +      break; + +   case TGSI_OPCODE_MIN: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); +      tmp = i915_get_utemp(p); +      flags = get_result_flags(inst); + +      i915_emit_arith(p, +                      A0_MAX, +                      tmp, flags & A0_DEST_CHANNEL_ALL, 0, +                      negate(src0, 1, 1, 1, 1), +                      negate(src1, 1, 1, 1, 1), 0); + +      i915_emit_arith(p, +                      A0_MOV, +                      
get_result_vector(p, &inst->FullDstRegisters[0]), +                      flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0); +      break; + +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_SWZ: +      emit_simple_arith(p, inst, A0_MOV, 1); +      break; + +   case TGSI_OPCODE_MUL: +      emit_simple_arith(p, inst, A0_MUL, 2); +      break; + +   case TGSI_OPCODE_POW: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); +      tmp = i915_get_utemp(p); +      flags = get_result_flags(inst); + +      /* XXX: masking on intermediate values, here and elsewhere. +       */ +      i915_emit_arith(p, +                      A0_LOG, +                      tmp, A0_DEST_CHANNEL_X, 0, +                      swizzle(src0, X, X, X, X), 0, 0); + +      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0); + +      i915_emit_arith(p, +                      A0_EXP, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      flags, 0, swizzle(tmp, X, X, X, X), 0, 0); +      break; +       +   case TGSI_OPCODE_RET: +      /* XXX: no-op? */ +      break; +       +   case TGSI_OPCODE_RCP: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); + +      i915_emit_arith(p, +                      A0_RCP, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                         get_result_flags(inst), 0, +                      swizzle(src0, X, X, X, X), 0, 0); +      break; + +   case TGSI_OPCODE_RSQ: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); + +      i915_emit_arith(p, +                      A0_RSQ, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, X, X, X, X), 0, 0); +      break; + +   case TGSI_OPCODE_SCS: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      tmp = i915_get_utemp(p); + +      /*  +       * t0.xy = MUL x.xx11, x.x1111  ; x^2, x, 1, 1 +       * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x +       * t1 = MUL t0.xyyw t0.yz11    ; x^7 x^5 x^3 x +       * scs.x = DP4 t1, sin_constants +       * t1 = MUL t0.xxz1 t0.z111    ; x^6 x^4 x^2 1 +       * scs.y = DP4 t1, cos_constants +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_XY, 0, +                      swizzle(src0, X, X, ONE, ONE), +                      swizzle(src0, X, ONE, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_ALL, 0, +                      swizzle(tmp, X, Y, X, Y), +                      swizzle(tmp, X, X, ONE, ONE), 0); + +      writemask = inst->FullDstRegisters[0].DstRegister.WriteMask; + +      if (writemask & TGSI_WRITEMASK_Y) { +         uint tmp1; + +         if (writemask & TGSI_WRITEMASK_X) +            tmp1 = i915_get_utemp(p); +         else +            tmp1 = tmp; + +         i915_emit_arith(p, +                         A0_MUL, +                         tmp1, A0_DEST_CHANNEL_ALL, 0, +                         swizzle(tmp, X, Y, Y, W), +                         swizzle(tmp, X, Z, ONE, ONE), 0); + +         i915_emit_arith(p, +                         A0_DP4, +                         get_result_vector(p, &inst->FullDstRegisters[0]), +                         A0_DEST_CHANNEL_Y, 0, +                         swizzle(tmp1, W, Z, Y, X), +                         i915_emit_const4fv(p, sin_constants), 0); +      } + +      if (writemask & 
TGSI_WRITEMASK_X) { +         i915_emit_arith(p, +                         A0_MUL, +                         tmp, A0_DEST_CHANNEL_XYZ, 0, +                         swizzle(tmp, X, X, Z, ONE), +                         swizzle(tmp, Z, ONE, ONE, ONE), 0); + +         i915_emit_arith(p, +                         A0_DP4, +                         get_result_vector(p, &inst->FullDstRegisters[0]), +                         A0_DEST_CHANNEL_X, 0, +                         swizzle(tmp, ONE, Z, Y, X), +                         i915_emit_const4fv(p, cos_constants), 0); +      } +      break; + +   case TGSI_OPCODE_SGE: +      emit_simple_arith(p, inst, A0_SGE, 2); +      break; + +   case TGSI_OPCODE_SLE: +      /* like SGE, but swap reg0, reg1 */ +      emit_simple_arith_swap2(p, inst, A0_SGE, 2); +      break; + +   case TGSI_OPCODE_SIN: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      tmp = i915_get_utemp(p); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_X, 0, +                      src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); + +      i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); + +      /* By choosing different taylor constants, could get rid of this mul: +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_X, 0, +                      tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0); + +      /*  +       * t0.xy = MUL x.xx11, x.x1111  ; x^2, x, 1, 1 +       * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x +       * t1 = MUL t0.xyyw t0.yz11    ; x^7 x^5 x^3 x +       * result = DP4 t1.wzyx, sin_constants +       */ +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_XY, 0, +                      swizzle(tmp, X, X, ONE, ONE), +                      swizzle(tmp, X, ONE, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_ALL, 0, +                      swizzle(tmp, X, Y, X, Y), +                      swizzle(tmp, X, X, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_ALL, 0, +                      swizzle(tmp, X, Y, Y, W), +                      swizzle(tmp, X, Z, ONE, ONE), 0); + +      i915_emit_arith(p, +                      A0_DP4, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(tmp, W, Z, Y, X), +                      i915_emit_const4fv(p, sin_constants), 0); +      break; + +   case TGSI_OPCODE_SLT: +      emit_simple_arith(p, inst, A0_SLT, 2); +      break; + +   case TGSI_OPCODE_SGT: +      /* like SLT, but swap reg0, reg1 */ +      emit_simple_arith_swap2(p, inst, A0_SLT, 2); +      break; + +   case TGSI_OPCODE_SUB: +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); + +      i915_emit_arith(p, +                      A0_ADD, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      src0, negate(src1, 1, 1, 1, 1), 0); +      break; + +   case TGSI_OPCODE_TEX: +      emit_tex(p, inst, T0_TEXLD); +      break; + +   case TGSI_OPCODE_TXB: +      emit_tex(p, inst, T0_TEXLDB); +      break; + +   case TGSI_OPCODE_TXP: +      emit_tex(p, inst, T0_TEXLDP); +      break; + +   case TGSI_OPCODE_XPD: + 
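The XPD case below computes the cross product as one MUL followed by one MAD whose third operand is negated, instead of three explicit subtractions. A minimal scalar sketch of the same decomposition, using a hypothetical float3 type and mirroring the swizzles the emitted instructions use (illustration only, not part of the patch):

/* Illustrative sketch only; float3 is a hypothetical helper type. */
typedef struct { float x, y, z; } float3;

static float3
cross_via_mul_mad(float3 a, float3 b)
{
   float3 tmp, result;

   /* MUL: tmp = a.zxy * b.yzx */
   tmp.x = a.z * b.y;
   tmp.y = a.x * b.z;
   tmp.z = a.y * b.x;

   /* MAD: result = a.yzx * b.zxy + (-tmp) */
   result.x = a.y * b.z - tmp.x;   /* = a.y*b.z - a.z*b.y */
   result.y = a.z * b.x - tmp.y;   /* = a.z*b.x - a.x*b.z */
   result.z = a.x * b.y - tmp.z;   /* = a.x*b.y - a.y*b.x */
   return result;
}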
     /* Cross product: +       *      result.x = src0.y * src1.z - src0.z * src1.y; +       *      result.y = src0.z * src1.x - src0.x * src1.z; +       *      result.z = src0.x * src1.y - src0.y * src1.x; +       *      result.w = undef; +       */ +      src0 = src_vector(p, &inst->FullSrcRegisters[0]); +      src1 = src_vector(p, &inst->FullSrcRegisters[1]); +      tmp = i915_get_utemp(p); + +      i915_emit_arith(p, +                      A0_MUL, +                      tmp, A0_DEST_CHANNEL_ALL, 0, +                      swizzle(src0, Z, X, Y, ONE), +                      swizzle(src1, Y, Z, X, ONE), 0); + +      i915_emit_arith(p, +                      A0_MAD, +                      get_result_vector(p, &inst->FullDstRegisters[0]), +                      get_result_flags(inst), 0, +                      swizzle(src0, Y, Z, X, ONE), +                      swizzle(src1, Z, X, Y, ONE), +                      negate(tmp, 1, 1, 1, 0)); +      break; + +   default: +      i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode); +      p->error = 1; +      return; +   } + +   i915_release_utemps(p); +} + + +/** + * Translate TGSI fragment shader into i915 hardware instructions. + * \param p  the translation state + * \param tokens  the TGSI token array + */ +static void +i915_translate_instructions(struct i915_fp_compile *p, +                            const struct tgsi_token *tokens) +{ +   struct i915_fragment_shader *ifs = p->shader; +   struct tgsi_parse_context parse; + +   tgsi_parse_init( &parse, tokens ); + +   while( !tgsi_parse_end_of_tokens( &parse ) ) { + +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +         if (parse.FullToken.FullDeclaration.Declaration.File +                  == TGSI_FILE_CONSTANT) { +            uint i; +            for (i = parse.FullToken.FullDeclaration.DeclarationRange.First; +                 i <= parse.FullToken.FullDeclaration.DeclarationRange.Last; +                 i++) { +               assert(ifs->constant_flags[i] == 0x0); +               ifs->constant_flags[i] = I915_CONSTFLAG_USER; +               ifs->num_constants = MAX2(ifs->num_constants, i + 1); +            } +         } +         else if (parse.FullToken.FullDeclaration.Declaration.File +                  == TGSI_FILE_TEMPORARY) { +            uint i; +            for (i = parse.FullToken.FullDeclaration.DeclarationRange.First; +                 i <= parse.FullToken.FullDeclaration.DeclarationRange.Last; +                 i++) { +               assert(i < I915_MAX_TEMPORARY); +               /* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */ +               p->temp_flag |= (1 << i); /* mark temp as used */ +            } +         } +         break; + +      case TGSI_TOKEN_TYPE_IMMEDIATE: +         { +            const struct tgsi_full_immediate *imm +               = &parse.FullToken.FullImmediate; +            const uint pos = p->num_immediates++; +            uint j; +            for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { +               p->immediates[pos][j] = imm->u.ImmediateFloat32[j].Float; +            } +         } +         break; + +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         if (p->first_instruction) { +            /* resolve location of immediates */ +            uint i, j; +            for (i = 0; i < p->num_immediates; i++) { +               /* find constant slot for this immediate */ +               for (j = 0; j < I915_MAX_CONSTANT; j++) { +               
   if (ifs->constant_flags[j] == 0x0) { +                     memcpy(ifs->constants[j], +                            p->immediates[i], +                            4 * sizeof(float)); +                     /*printf("immediate %d maps to const %d\n", i, j);*/ +                     ifs->constant_flags[j] = 0xf;  /* all four comps used */ +                     p->immediates_map[i] = j; +                     ifs->num_constants = MAX2(ifs->num_constants, j + 1); +                     break; +                  } +               } +            } + +            p->first_instruction = FALSE; +         } + +         i915_translate_instruction(p, &parse.FullToken.FullInstruction); +         break; + +      default: +         assert( 0 ); +      } + +   } /* while */ + +   tgsi_parse_free (&parse); +} + + +static struct i915_fp_compile * +i915_init_compile(struct i915_context *i915, +                  struct i915_fragment_shader *ifs) +{ +   struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile); + +   p->shader = ifs; + +   /* Put new constants at end of const buffer, growing downward. +    * The problem is we don't know how many user-defined constants might +    * be specified with pipe->set_constant_buffer(). +    * Should pre-scan the user's program to determine the highest-numbered +    * constant referenced. +    */ +   ifs->num_constants = 0; +   memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags)); + +   p->first_instruction = TRUE; + +   p->nr_tex_indirect = 1;      /* correct? */ +   p->nr_tex_insn = 0; +   p->nr_alu_insn = 0; +   p->nr_decl_insn = 0; + +   p->csr = p->program; +   p->decl = p->declarations; +   p->decl_s = 0; +   p->decl_t = 0; +   p->temp_flag = ~0x0 << I915_MAX_TEMPORARY; +   p->utemp_flag = ~0x7; + +   p->wpos_tex = -1; + +   /* initialize the first program word */ +   *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM; + +   return p; +} + + +/* Copy compile results to the fragment program struct and destroy the + * compilation context. 
+ */ +static void +i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) +{ +   struct i915_fragment_shader *ifs = p->shader; +   unsigned long program_size = (unsigned long) (p->csr - p->program); +   unsigned long decl_size = (unsigned long) (p->decl - p->declarations); + +   if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT) +      i915_program_error(p, "Exceeded max nr indirect texture lookups"); + +   if (p->nr_tex_insn > I915_MAX_TEX_INSN) +      i915_program_error(p, "Exceeded max TEX instructions"); + +   if (p->nr_alu_insn > I915_MAX_ALU_INSN) +      i915_program_error(p, "Exceeded max ALU instructions"); + +   if (p->nr_decl_insn > I915_MAX_DECL_INSN) +      i915_program_error(p, "Exceeded max DECL instructions"); + +   if (p->error) { +      p->NumNativeInstructions = 0; +      p->NumNativeAluInstructions = 0; +      p->NumNativeTexInstructions = 0; +      p->NumNativeTexIndirections = 0; + +      i915_use_passthrough_shader(ifs); +   } +   else { +      p->NumNativeInstructions +         = p->nr_alu_insn + p->nr_tex_insn + p->nr_decl_insn; +      p->NumNativeAluInstructions = p->nr_alu_insn; +      p->NumNativeTexInstructions = p->nr_tex_insn; +      p->NumNativeTexIndirections = p->nr_tex_indirect; + +      /* patch in the program length */ +      p->declarations[0] |= program_size + decl_size - 2; + +      /* Copy compilation results to fragment program struct:  +       */ +      assert(!ifs->program); +      ifs->program +         = (uint *) MALLOC((program_size + decl_size) * sizeof(uint)); +      if (ifs->program) { +         ifs->program_len = program_size + decl_size; + +         memcpy(ifs->program, +                p->declarations,  +                decl_size * sizeof(uint)); + +         memcpy(ifs->program + decl_size,  +                p->program,  +                program_size * sizeof(uint)); +      } +   } + +   /* Release the compilation struct:  +    */ +   FREE(p); +} + + +/** + * Find an unused texture coordinate slot to use for fragment WPOS. + * Update p->fp->wpos_tex with the result (-1 if no used texcoord slot is found). + */ +static void +i915_find_wpos_space(struct i915_fp_compile *p) +{ +#if 0 +   const uint inputs +      = p->shader->inputs_read | (1 << TGSI_ATTRIB_POS); /*XXX hack*/ +   uint i; + +   p->wpos_tex = -1; + +   if (inputs & (1 << TGSI_ATTRIB_POS)) { +      for (i = 0; i < I915_TEX_UNITS; i++) { +	 if ((inputs & (1 << (TGSI_ATTRIB_TEX0 + i))) == 0) { +	    p->wpos_tex = i; +	    return; +	 } +      } + +      i915_program_error(p, "No free texcoord for wpos value"); +   } +#else +   if (p->shader->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { +      /* frag shader using the fragment position input */ +#if 0 +      assert(0); +#endif +   } +#endif +} + + + + +/** + * Rather than trying to intercept and jiggle depth writes during + * emit, just move the value into its correct position at the end of + * the program: + */ +static void +i915_fixup_depth_write(struct i915_fp_compile *p) +{ +   /* XXX assuming pos/depth is always in output[0] */ +   if (p->shader->info.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) { +      const uint depth = UREG(REG_TYPE_OD, 0); + +      i915_emit_arith(p, +                      A0_MOV,                     /* opcode */ +                      depth,                      /* dest reg */ +                      A0_DEST_CHANNEL_W,          /* write mask */ +                      0,                          /* saturate? 
*/ +                      swizzle(depth, X, Y, Z, Z), /* src0 */ +                      0, 0 /* src1, src2 */); +   } +} + + +void +i915_translate_fragment_program( struct i915_context *i915, +                                 struct i915_fragment_shader *fs) +{ +   struct i915_fp_compile *p = i915_init_compile(i915, fs); +   const struct tgsi_token *tokens = fs->state.tokens; + +   i915_find_wpos_space(p); + +#if 0 +   tgsi_dump(tokens, 0); +#endif + +   i915_translate_instructions(p, tokens); +   i915_fixup_depth_write(p); + +   i915_fini_compile(i915, p); +} diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c new file mode 100644 index 0000000000..8f1f58b2dd --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_prim_emit.c @@ -0,0 +1,220 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "draw/draw_pipe.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "i915_batch.h" + + + +/** + * Primitive emit to hardware.  No support for vertex buffers or any + * nice fast paths. + */ +struct setup_stage { +   struct draw_stage stage; /**< This must be first (base class) */ + +   struct i915_context *i915;    +}; + + + +/** + * Basically a cast wrapper. + */ +static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +{ +   return (struct setup_stage *)stage; +} + + +/** + * Extract the needed fields from vertex_header and emit i915 dwords. + * Recall that the vertices are constructed by the 'draw' module and + * have a couple of slots at the beginning (1-dword header, 4-dword + * clip pos) that we ignore here. 
+ */ +static INLINE void +emit_hw_vertex( struct i915_context *i915, +                const struct vertex_header *vertex) +{ +   const struct vertex_info *vinfo = &i915->current.vertex_info; +   uint i; +   uint count = 0;  /* for debug/sanity */ + +   assert(!i915->dirty); + +   for (i = 0; i < vinfo->num_attribs; i++) { +      const uint j = vinfo->attrib[i].src_index; +      const float *attrib = vertex->data[j]; +      switch (vinfo->attrib[i].emit) { +      case EMIT_1F: +         OUT_BATCH( fui(attrib[0]) ); +         count++; +         break; +      case EMIT_2F: +         OUT_BATCH( fui(attrib[0]) ); +         OUT_BATCH( fui(attrib[1]) ); +         count += 2; +         break; +      case EMIT_3F: +         OUT_BATCH( fui(attrib[0]) ); +         OUT_BATCH( fui(attrib[1]) ); +         OUT_BATCH( fui(attrib[2]) ); +         count += 3; +         break; +      case EMIT_4F: +         OUT_BATCH( fui(attrib[0]) ); +         OUT_BATCH( fui(attrib[1]) ); +         OUT_BATCH( fui(attrib[2]) ); +         OUT_BATCH( fui(attrib[3]) ); +         count += 4; +         break; +      case EMIT_4UB: +         OUT_BATCH( pack_ub4(float_to_ubyte( attrib[2] ), +                             float_to_ubyte( attrib[1] ), +                             float_to_ubyte( attrib[0] ), +                             float_to_ubyte( attrib[3] )) ); +         count += 1; +         break; +      default: +         assert(0); +      } +   } +   assert(count == vinfo->size); +} + + + +static INLINE void  +emit_prim( struct draw_stage *stage,  +	   struct prim_header *prim, +	   unsigned hwprim, +	   unsigned nr ) +{ +   struct i915_context *i915 = setup_stage(stage)->i915; +   unsigned vertex_size; +   unsigned i; + +   if (i915->dirty) +      i915_update_derived( i915 ); + +   if (i915->hardware_dirty) +      i915_emit_hardware_state( i915 ); + +   /* need to do this after validation! */ +   vertex_size = i915->current.vertex_info.size * 4; /* in bytes */ +   assert(vertex_size >= 12); /* never smaller than 12 bytes */ + +   if (!BEGIN_BATCH( 1 + nr * vertex_size / 4, 0 )) { +      FLUSH_BATCH(NULL); + +      /* Make sure state is re-emitted after a flush:  +       */ +      i915_update_derived( i915 ); +      i915_emit_hardware_state( i915 ); + +      if (!BEGIN_BATCH( 1 + nr * vertex_size / 4, 0 )) { +	 assert(0); +	 return; +      } +   } + +   /* Emit each triangle as a single primitive.  I told you this was +    * simple. +    */ +   OUT_BATCH(_3DPRIMITIVE |  +	     hwprim | +	     ((4 + vertex_size * nr)/4 - 2)); + +   for (i = 0; i < nr; i++) +      emit_hw_vertex(i915, prim->v[i]); +} + + +static void  +setup_tri( struct draw_stage *stage, struct prim_header *prim ) +{ +   emit_prim( stage, prim, PRIM3D_TRILIST, 3 ); +} + + +static void +setup_line(struct draw_stage *stage, struct prim_header *prim) +{ +   emit_prim( stage, prim, PRIM3D_LINELIST, 2 ); +} + + +static void +setup_point(struct draw_stage *stage, struct prim_header *prim) +{ +   emit_prim( stage, prim, PRIM3D_POINTLIST, 1 ); +} + + +static void setup_flush( struct draw_stage *stage, unsigned flags ) +{ +} + +static void reset_stipple_counter( struct draw_stage *stage ) +{ +} + +static void render_destroy( struct draw_stage *stage ) +{ +   FREE( stage ); +} + + +/** + * Create a new primitive setup/render stage.  This gets plugged into + * the 'draw' module's pipeline. 
+ */
+struct draw_stage *i915_draw_render_stage( struct i915_context *i915 )
+{
+   struct setup_stage *setup = CALLOC_STRUCT(setup_stage);
+
+   setup->i915 = i915;
+   setup->stage.draw = i915->draw;
+   setup->stage.point = setup_point;
+   setup->stage.line = setup_line;
+   setup->stage.tri = setup_tri;
+   setup->stage.flush = setup_flush;
+   setup->stage.reset_stipple_counter = reset_stipple_counter;
+   setup->stage.destroy = render_destroy;
+
+   return &setup->stage;
+}
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
new file mode 100644
index 0000000000..f49f6d6ed1
--- /dev/null
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -0,0 +1,545 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * Build post-transformation, post-clipping vertex buffers and element
+ * lists by hooking into the end of the primitive pipeline and
+ * manipulating the vertex_id field in the vertex headers.
+ *
+ * XXX: work in progress 
+ * 
+ * \author José Fonseca <jrfonseca@tungstengraphics.com>
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_winsys.h"
+#include "i915_batch.h"
+#include "i915_state.h"
+
+
+/**
+ * Primitive renderer for i915.
+ */
+struct i915_vbuf_render {
+   struct vbuf_render base;
+
+   struct i915_context *i915;
+
+   /** Vertex size in bytes */
+   unsigned vertex_size;
+
+   /** Software primitive */
+   unsigned prim;
+
+   /** Hardware primitive */
+   unsigned hwprim;
+
+   /** Generate a vertex list */
+   unsigned fallback;
+
+   /* Stuff for the vbo */
+   struct pipe_buffer *vbo;
+   size_t vbo_size;
+   size_t vbo_offset;
+   void *vbo_ptr;
+   size_t vbo_alloc_size;
+};
+
+
+/**
+ * Basically a cast wrapper.
+ */ +static INLINE struct i915_vbuf_render * +i915_vbuf_render( struct vbuf_render *render ) +{ +   assert(render); +   return (struct i915_vbuf_render *)render; +} + + +static const struct vertex_info * +i915_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; + +   if (i915->dirty) { +      /* make sure we have up to date vertex layout */ +      i915_update_derived( i915 ); +   } + +   return &i915->current.vertex_info; +} + + +static void * +i915_vbuf_render_allocate_vertices( struct vbuf_render *render, +                                    ushort vertex_size, +                                    ushort nr_vertices ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   struct pipe_screen *screen = i915->pipe.screen; +   size_t size = (size_t)vertex_size * (size_t)nr_vertices; + +   /* FIXME: handle failure */ +   assert(!i915->vbo); + +   if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) { +   } else { +      i915->vbo_flushed = 0; +      pipe_buffer_reference(screen, &i915_render->vbo, NULL); +   } + +   if (!i915_render->vbo) { +      i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size); +      i915_render->vbo_offset = 0; +      i915_render->vbo = pipe_buffer_create(screen, +                                            64, +                                            I915_BUFFER_USAGE_LIT_VERTEX, +                                            i915_render->vbo_size); +      i915_render->vbo_ptr = pipe_buffer_map(screen, +                                             i915_render->vbo, +                                             PIPE_BUFFER_USAGE_CPU_WRITE); +      pipe_buffer_unmap(screen, i915_render->vbo); +   } + +   i915->vbo = i915_render->vbo; +   i915->vbo_offset = i915_render->vbo_offset; +   i915->dirty |= I915_NEW_VBO; + +   return (unsigned char *)i915_render->vbo_ptr + i915->vbo_offset; +} + + +static boolean +i915_vbuf_render_set_primitive( struct vbuf_render *render,  +                                unsigned prim ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   i915_render->prim = prim; + +   switch(prim) { +   case PIPE_PRIM_POINTS: +      i915_render->hwprim = PRIM3D_POINTLIST; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_LINES: +      i915_render->hwprim = PRIM3D_LINELIST; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_LINE_LOOP: +      i915_render->hwprim = PRIM3D_LINELIST; +      i915_render->fallback = PIPE_PRIM_LINE_LOOP; +      return TRUE; +   case PIPE_PRIM_LINE_STRIP: +      i915_render->hwprim = PRIM3D_LINESTRIP; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_TRIANGLES: +      i915_render->hwprim = PRIM3D_TRILIST; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_TRIANGLE_STRIP: +      i915_render->hwprim = PRIM3D_TRISTRIP; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_TRIANGLE_FAN: +      i915_render->hwprim = PRIM3D_TRIFAN; +      i915_render->fallback = 0; +      return TRUE; +   case PIPE_PRIM_QUADS: +      i915_render->hwprim = PRIM3D_TRILIST; +      i915_render->fallback = PIPE_PRIM_QUADS; +      return TRUE; +   case PIPE_PRIM_QUAD_STRIP: +      i915_render->hwprim = PRIM3D_TRILIST; +      i915_render->fallback = PIPE_PRIM_QUAD_STRIP; +      return TRUE; + 
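The cases above that set a nonzero fallback (line loops, quads, quad strips) are not submitted natively; the routines further down (draw_arrays_generate_indices, draw_generate_indices) expand them into an index stream the hardware accepts, packing two 16-bit indices per dword. A minimal sketch of the quad-list expansion on a plain index array, using the same vertex ordering as the PIPE_PRIM_QUADS case below (hypothetical helper, illustration only, not part of the patch):

/* Illustrative sketch only: expand a quad list into triangle-list indices.
 * Each quad (v0, v1, v2, v3) becomes the triangles (v0, v1, v3) and
 * (v1, v2, v3), the same ordering draw_generate_indices() emits below. */
static unsigned
expand_quads_to_tris(const unsigned short *quad_elts, unsigned nr_elts,
                     unsigned short *tri_elts)
{
   unsigned i, n = 0;

   for (i = 0; i + 3 < nr_elts; i += 4) {
      tri_elts[n++] = quad_elts[i + 0];
      tri_elts[n++] = quad_elts[i + 1];
      tri_elts[n++] = quad_elts[i + 3];
      tri_elts[n++] = quad_elts[i + 1];
      tri_elts[n++] = quad_elts[i + 2];
      tri_elts[n++] = quad_elts[i + 3];
   }
   return n;   /* equals (nr_elts / 4) * 6, as in draw_calc_nr_indices() */
}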
  case PIPE_PRIM_POLYGON: +      i915_render->hwprim = PRIM3D_POLY; +      i915_render->fallback = 0; +      return TRUE; +   default: +      /* FIXME: Actually, can handle a lot more just fine... */ +      return FALSE; +   } +} + + + +/** + * Used for fallbacks in draw_arrays + */ +static void +draw_arrays_generate_indices( struct vbuf_render *render, +                              unsigned start, uint nr, +                              unsigned type ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   unsigned i; +   unsigned end = start + nr; +   switch(type) { +   case 0: +      for (i = start; i+1 < end; i += 2) +	 OUT_BATCH( (i+0) | (i+1) << 16 ); +      if (i < end) +	 OUT_BATCH( i ); +      break; +   case PIPE_PRIM_LINE_LOOP: +      if (nr >= 2) { +	 for (i = start + 1; i < end; i++) +	    OUT_BATCH( (i-0) | (i+0) << 16 ); +	 OUT_BATCH( (i-0) | (  start) << 16 ); +      } +      break; +   case PIPE_PRIM_QUADS: +      for (i = start; i + 3 < end; i += 4) { +	 OUT_BATCH( (i+0) | (i+1) << 16 ); +	 OUT_BATCH( (i+3) | (i+1) << 16 ); +	 OUT_BATCH( (i+2) | (i+3) << 16 ); +      } +      break; +   case PIPE_PRIM_QUAD_STRIP: +      for (i = start; i + 3 < end; i += 2) { +	 OUT_BATCH( (i+0) | (i+1) << 16 ); +	 OUT_BATCH( (i+3) | (i+2) << 16 ); +	 OUT_BATCH( (i+0) | (i+3) << 16 ); +      } +      break; +   default: +      assert(0); +   } +} + +static unsigned +draw_arrays_calc_nr_indices( uint nr, unsigned type ) +{ +   switch (type) { +   case 0: +      return nr; +   case PIPE_PRIM_LINE_LOOP: +      if (nr >= 2) +	 return nr * 2; +      else +	 return 0; +   case PIPE_PRIM_QUADS: +      return (nr / 4) * 6; +   case PIPE_PRIM_QUAD_STRIP: +      return ((nr - 2) / 2) * 6; +   default: +      assert(0); +      return 0; +   } +} + +static void +draw_arrays_fallback( struct vbuf_render *render, +                      unsigned start, +                      uint nr ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   unsigned nr_indices; + +   if (i915->dirty) +      i915_update_derived( i915 ); + +   if (i915->hardware_dirty) +      i915_emit_hardware_state( i915 ); + +   nr_indices = draw_arrays_calc_nr_indices( nr, i915_render->fallback ); +   if (!nr_indices) +      return; + +   if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { +      FLUSH_BATCH(NULL); + +      /* Make sure state is re-emitted after a flush: +       */ +      i915_update_derived( i915 ); +      i915_emit_hardware_state( i915 ); +      i915->vbo_flushed = 1; + +      if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { +	 assert(0); +	 goto out; +      } +   } +   OUT_BATCH( _3DPRIMITIVE | +	      PRIM_INDIRECT | +	      i915_render->hwprim | +	      PRIM_INDIRECT_ELTS | +	      nr_indices ); + +   draw_arrays_generate_indices( render, start, nr, i915_render->fallback ); + +out: +   return; +} + +static void +i915_vbuf_render_draw_arrays( struct vbuf_render *render, +                              unsigned start, +                              uint nr ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + +   if (i915_render->fallback) { +      draw_arrays_fallback( render, start, nr ); +      return; +   } + +   /* JB: TODO submit direct cmds */ +   draw_arrays_fallback( render, start, nr ); +} + +/** + * Used for normal and fallback emitting of indices + * If type is zero normal operation assumed. 
+ */ +static void +draw_generate_indices( struct vbuf_render *render, +                       const ushort *indices, +                       uint nr_indices, +                       unsigned type ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   unsigned i; + +   switch(type) { +   case 0: +      for (i = 0; i + 1 < nr_indices; i += 2) { +	 OUT_BATCH( indices[i] | indices[i+1] << 16 ); +      } +      if (i < nr_indices) { +	 OUT_BATCH( indices[i] ); +      } +      break; +   case PIPE_PRIM_LINE_LOOP: +      if (nr_indices >= 2) { +	 for (i = 1; i < nr_indices; i++) +	    OUT_BATCH( indices[i-1] | indices[i] << 16 ); +	 OUT_BATCH( indices[i-1] | indices[0] << 16 ); +      } +      break; +   case PIPE_PRIM_QUADS: +      for (i = 0; i + 3 < nr_indices; i += 4) { +	 OUT_BATCH( indices[i+0] | indices[i+1] << 16 ); +	 OUT_BATCH( indices[i+3] | indices[i+1] << 16 ); +	 OUT_BATCH( indices[i+2] | indices[i+3] << 16 ); +      } +      break; +   case PIPE_PRIM_QUAD_STRIP: +      for (i = 0; i + 3 < nr_indices; i += 2) { +	 OUT_BATCH( indices[i+0] | indices[i+1] << 16 ); +	 OUT_BATCH( indices[i+3] | indices[i+2] << 16 ); +	 OUT_BATCH( indices[i+0] | indices[i+3] << 16 ); +      } +      break; +   default: +      assert(0); +      break; +   } +} + +static unsigned +draw_calc_nr_indices( uint nr_indices, unsigned type ) +{ +   switch (type) { +   case 0: +      return nr_indices; +   case PIPE_PRIM_LINE_LOOP: +      if (nr_indices >= 2) +	 return nr_indices * 2; +      else +	 return 0; +   case PIPE_PRIM_QUADS: +      return (nr_indices / 4) * 6; +   case PIPE_PRIM_QUAD_STRIP: +      return ((nr_indices - 2) / 2) * 6; +   default: +      assert(0); +      return 0; +   } +} + +static void  +i915_vbuf_render_draw( struct vbuf_render *render, +                       const ushort *indices, +                       uint nr_indices) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   unsigned save_nr_indices; + +   save_nr_indices = nr_indices; + +   nr_indices = draw_calc_nr_indices( nr_indices, i915_render->fallback ); +   if (!nr_indices) +      return; + +   if (i915->dirty) +      i915_update_derived( i915 ); + +   if (i915->hardware_dirty) +      i915_emit_hardware_state( i915 ); + +   if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { +      FLUSH_BATCH(NULL); + +      /* Make sure state is re-emitted after a flush:  +       */ +      i915_update_derived( i915 ); +      i915_emit_hardware_state( i915 ); +      i915->vbo_flushed = 1; + +      if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { +	 assert(0); +     goto out; +      } +   } + +   OUT_BATCH( _3DPRIMITIVE | +	      PRIM_INDIRECT | +	      i915_render->hwprim | +	      PRIM_INDIRECT_ELTS | +	      nr_indices ); +   draw_generate_indices( render, +			  indices, +			  save_nr_indices, +			  i915_render->fallback ); + +out: +   return; +} + + +static void +i915_vbuf_render_release_vertices( struct vbuf_render *render, +			           void *vertices,  +			           unsigned vertex_size, +			           unsigned vertices_used ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   struct i915_context *i915 = i915_render->i915; +   size_t size = (size_t)vertex_size * (size_t)vertices_used; + +   assert(i915->vbo); + +   i915_render->vbo_offset += size; +   i915->vbo = NULL; +   i915->dirty |= I915_NEW_VBO; +} + + +static void +i915_vbuf_render_destroy( struct vbuf_render 
*render ) +{ +   struct i915_vbuf_render *i915_render = i915_vbuf_render(render); +   FREE(i915_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +i915_vbuf_render_create( struct i915_context *i915 ) +{ +   struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render); +   struct pipe_screen *screen = i915->pipe.screen; + +   i915_render->i915 = i915; +    +   i915_render->base.max_vertex_buffer_bytes = 128*1024; +    +   /* NOTE: it must be such that state and vertices indices fit in a single  +    * batch buffer. +    */ +   i915_render->base.max_indices = 16*1024; + +   i915_render->base.get_vertex_info = i915_vbuf_render_get_vertex_info; +   i915_render->base.allocate_vertices = i915_vbuf_render_allocate_vertices; +   i915_render->base.set_primitive = i915_vbuf_render_set_primitive; +   i915_render->base.draw = i915_vbuf_render_draw; +   i915_render->base.draw_arrays = i915_vbuf_render_draw_arrays; +   i915_render->base.release_vertices = i915_vbuf_render_release_vertices; +   i915_render->base.destroy = i915_vbuf_render_destroy; + +   i915_render->vbo_alloc_size = 128 * 4096; +   i915_render->vbo_size = i915_render->vbo_alloc_size; +   i915_render->vbo_offset = 0; +   i915_render->vbo = pipe_buffer_create(screen, +                                         64, +                                         I915_BUFFER_USAGE_LIT_VERTEX, +                                         i915_render->vbo_size); +   i915_render->vbo_ptr = pipe_buffer_map(screen, +                                          i915_render->vbo, +                                          PIPE_BUFFER_USAGE_CPU_WRITE); +   pipe_buffer_unmap(screen, i915_render->vbo); + +   return &i915_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 ) +{ +   struct vbuf_render *render; +   struct draw_stage *stage; +    +   render = i915_vbuf_render_create(i915); +   if(!render) +      return NULL; +    +   stage = draw_vbuf_stage( i915->draw, render ); +   if(!stage) { +      render->destroy(render); +      return NULL; +   } +   /** TODO JB: this shouldn't be here */ +   draw_set_render(i915->draw, render); + +   return stage; +} diff --git a/src/gallium/drivers/i915simple/i915_reg.h b/src/gallium/drivers/i915simple/i915_reg.h new file mode 100644 index 0000000000..04620fec68 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_reg.h @@ -0,0 +1,978 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef I915_REG_H +#define I915_REG_H + + +#define I915_SET_FIELD( var, mask, value ) (var &= ~(mask), var |= value) + +#define CMD_3D (0x3<<29) + +#define PRIM3D_INLINE		(CMD_3D | (0x1f<<24)) +#define PRIM3D_TRILIST		(0x0<<18) +#define PRIM3D_TRISTRIP 	(0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18) +#define PRIM3D_TRIFAN		(0x3<<18) +#define PRIM3D_POLY		(0x4<<18) +#define PRIM3D_LINELIST 	(0x5<<18) +#define PRIM3D_LINESTRIP	(0x6<<18) +#define PRIM3D_RECTLIST 	(0x7<<18) +#define PRIM3D_POINTLIST	(0x8<<18) +#define PRIM3D_DIB		(0x9<<18) +#define PRIM3D_CLEAR_RECT	(0xa<<18) +#define PRIM3D_ZONE_INIT	(0xd<<18) +#define PRIM3D_MASK		(0x1f<<18) + +/* p137 */ +#define _3DSTATE_AA_CMD			(CMD_3D | (0x06<<24)) +#define AA_LINE_ECAAR_WIDTH_ENABLE	(1<<16) +#define AA_LINE_ECAAR_WIDTH_0_5 	0 +#define AA_LINE_ECAAR_WIDTH_1_0		(1<<14) +#define AA_LINE_ECAAR_WIDTH_2_0 	(2<<14) +#define AA_LINE_ECAAR_WIDTH_4_0 	(3<<14) +#define AA_LINE_REGION_WIDTH_ENABLE	(1<<8) +#define AA_LINE_REGION_WIDTH_0_5	0 +#define AA_LINE_REGION_WIDTH_1_0	(1<<6) +#define AA_LINE_REGION_WIDTH_2_0	(2<<6) +#define AA_LINE_REGION_WIDTH_4_0	(3<<6) + +/* 3DSTATE_BACKFACE_STENCIL_OPS, p138*/ +#define _3DSTATE_BACKFACE_STENCIL_OPS    (CMD_3D | (0x8<<24)) +#define BFO_ENABLE_STENCIL_REF          (1<<23) +#define BFO_STENCIL_REF_SHIFT           15 +#define BFO_STENCIL_REF_MASK            (0xff<<15) +#define BFO_ENABLE_STENCIL_FUNCS        (1<<14) +#define BFO_STENCIL_TEST_SHIFT          11 +#define BFO_STENCIL_TEST_MASK           (0x7<<11) +#define BFO_STENCIL_FAIL_SHIFT          8 +#define BFO_STENCIL_FAIL_MASK           (0x7<<8) +#define BFO_STENCIL_PASS_Z_FAIL_SHIFT   5 +#define BFO_STENCIL_PASS_Z_FAIL_MASK    (0x7<<5) +#define BFO_STENCIL_PASS_Z_PASS_SHIFT   2 +#define BFO_STENCIL_PASS_Z_PASS_MASK    (0x7<<2) +#define BFO_ENABLE_STENCIL_TWO_SIDE     (1<<1) +#define BFO_STENCIL_TWO_SIDE            (1<<0) + + +/* 3DSTATE_BACKFACE_STENCIL_MASKS, p140 */ +#define _3DSTATE_BACKFACE_STENCIL_MASKS    (CMD_3D | (0x9<<24)) +#define BFM_ENABLE_STENCIL_TEST_MASK      (1<<17) +#define BFM_ENABLE_STENCIL_WRITE_MASK     (1<<16) +#define BFM_STENCIL_TEST_MASK_SHIFT       8 +#define BFM_STENCIL_TEST_MASK_MASK        (0xff<<8) +#define BFM_STENCIL_WRITE_MASK_SHIFT      0 +#define BFM_STENCIL_WRITE_MASK_MASK       (0xff<<0) + + + +/* 3DSTATE_BIN_CONTROL p141 */ + +/* p143 */ +#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1) +/* Dword 1 */ +#define BUF_3D_ID_COLOR_BACK	(0x3<<24) +#define BUF_3D_ID_DEPTH 	(0x7<<24) +#define BUF_3D_USE_FENCE	(1<<23) +#define BUF_3D_TILED_SURFACE	(1<<22) +#define BUF_3D_TILE_WALK_X	0 +#define BUF_3D_TILE_WALK_Y	(1<<21) +#define BUF_3D_PITCH(x)         (((x)/4)<<2) +/* Dword 2 */ +#define BUF_3D_ADDR(x)		((x) & ~0x3) + + +/* 3DSTATE_CHROMA_KEY */ + +/* 3DSTATE_CLEAR_PARAMETERS, p150 */ +#define _3DSTATE_CLEAR_PARAMETERS	(CMD_3D | (0x1d<<24) | (0x9c<<16) | 5) +/* Dword 1 */ +#define CLEARPARAM_CLEAR_RECT		(1 << 16) +#define CLEARPARAM_ZONE_INIT		(0 << 16) +#define CLEARPARAM_WRITE_COLOR		(1 << 2) +#define CLEARPARAM_WRITE_DEPTH		(1 << 1) +#define CLEARPARAM_WRITE_STENCIL	(1 << 0) + +/* 3DSTATE_CONSTANT_BLEND_COLOR, p153 */ +#define 
_3DSTATE_CONST_BLEND_COLOR_CMD	(CMD_3D | (0x1d<<24) | (0x88<<16)) + + + +/* 3DSTATE_COORD_SET_BINDINGS, p154 */ +#define _3DSTATE_COORD_SET_BINDINGS      (CMD_3D | (0x16<<24)) +#define CSB_TCB(iunit, eunit)           ((eunit)<<(iunit*3)) + +/* p156 */ +#define _3DSTATE_DFLT_DIFFUSE_CMD	(CMD_3D | (0x1d<<24) | (0x99<<16)) + +/* p157 */ +#define _3DSTATE_DFLT_SPEC_CMD		(CMD_3D | (0x1d<<24) | (0x9a<<16)) + +/* p158 */ +#define _3DSTATE_DFLT_Z_CMD		(CMD_3D | (0x1d<<24) | (0x98<<16)) + + +/* 3DSTATE_DEPTH_OFFSET_SCALE, p159 */ +#define _3DSTATE_DEPTH_OFFSET_SCALE       (CMD_3D | (0x1d<<24) | (0x97<<16)) +/* scale in dword 1 */ + + +/* 3DSTATE_DEPTH_SUBRECT_DISABLE, p160 */ +#define _3DSTATE_DEPTH_SUBRECT_DISABLE    (CMD_3D | (0x1c<<24) | (0x11<<19) | 0x2) + +/* p161 */ +#define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16)) +/* Dword 1 */ +#define TEX_DEFAULT_COLOR_OGL           (0<<30) +#define TEX_DEFAULT_COLOR_D3D           (1<<30) +#define ZR_EARLY_DEPTH                  (1<<29) +#define LOD_PRECLAMP_OGL                (1<<28) +#define LOD_PRECLAMP_D3D                (0<<28) +#define DITHER_FULL_ALWAYS              (0<<26) +#define DITHER_FULL_ON_FB_BLEND         (1<<26) +#define DITHER_CLAMPED_ALWAYS           (2<<26) +#define LINEAR_GAMMA_BLEND_32BPP        (1<<25) +#define DEBUG_DISABLE_ENH_DITHER        (1<<24) +#define DSTORG_HORT_BIAS(x)		((x)<<20) +#define DSTORG_VERT_BIAS(x)		((x)<<16) +#define COLOR_4_2_2_CHNL_WRT_ALL	0 +#define COLOR_4_2_2_CHNL_WRT_Y		(1<<12) +#define COLOR_4_2_2_CHNL_WRT_CR		(2<<12) +#define COLOR_4_2_2_CHNL_WRT_CB		(3<<12) +#define COLOR_4_2_2_CHNL_WRT_CRCB	(4<<12) +#define COLOR_BUF_8BIT			0 +#define COLOR_BUF_RGB555 		(1<<8) +#define COLOR_BUF_RGB565 		(2<<8) +#define COLOR_BUF_ARGB8888		(3<<8) +#define DEPTH_FRMT_16_FIXED		0 +#define DEPTH_FRMT_16_FLOAT		(1<<2) +#define DEPTH_FRMT_24_FIXED_8_OTHER	(2<<2) +#define VERT_LINE_STRIDE_1		(1<<1) +#define VERT_LINE_STRIDE_0		(0<<1) +#define VERT_LINE_STRIDE_OFS_1		1 +#define VERT_LINE_STRIDE_OFS_0		0 + +/* p166 */ +#define _3DSTATE_DRAW_RECT_CMD		(CMD_3D|(0x1d<<24)|(0x80<<16)|3) +/* Dword 1 */ +#define DRAW_RECT_DIS_DEPTH_OFS 	(1<<30) +#define DRAW_DITHER_OFS_X(x)		((x)<<26) +#define DRAW_DITHER_OFS_Y(x)		((x)<<24) +/* Dword 2 */ +#define DRAW_YMIN(x)			((x)<<16) +#define DRAW_XMIN(x)			(x) +/* Dword 3 */ +#define DRAW_YMAX(x)			((x)<<16) +#define DRAW_XMAX(x)			(x) +/* Dword 4 */ +#define DRAW_YORG(x)			((x)<<16) +#define DRAW_XORG(x)			(x) + + +/* 3DSTATE_FILTER_COEFFICIENTS_4X4, p170 */ + +/* 3DSTATE_FILTER_COEFFICIENTS_6X5, p172 */ + + +/* _3DSTATE_FOG_COLOR, p173 */ +#define _3DSTATE_FOG_COLOR_CMD		(CMD_3D|(0x15<<24)) +#define FOG_COLOR_RED(x)		((x)<<16) +#define FOG_COLOR_GREEN(x)		((x)<<8) +#define FOG_COLOR_BLUE(x)		(x) + +/* _3DSTATE_FOG_MODE, p174 */ +#define _3DSTATE_FOG_MODE_CMD		(CMD_3D|(0x1d<<24)|(0x89<<16)|2) +/* Dword 1 */ +#define FMC1_FOGFUNC_MODIFY_ENABLE	(1<<31) +#define FMC1_FOGFUNC_VERTEX		(0<<28) +#define FMC1_FOGFUNC_PIXEL_EXP		(1<<28) +#define FMC1_FOGFUNC_PIXEL_EXP2		(2<<28) +#define FMC1_FOGFUNC_PIXEL_LINEAR	(3<<28) +#define FMC1_FOGFUNC_MASK		(3<<28) +#define FMC1_FOGINDEX_MODIFY_ENABLE     (1<<27) +#define FMC1_FOGINDEX_Z		        (0<<25) +#define FMC1_FOGINDEX_W   		(1<<25) +#define FMC1_C1_C2_MODIFY_ENABLE	(1<<24) +#define FMC1_DENSITY_MODIFY_ENABLE	(1<<23) +#define FMC1_C1_ONE      	        (1<<13) +#define FMC1_C1_MASK		        (0xffff<<4) +/* Dword 2 */ +#define FMC2_C2_ONE		        (1<<16) +/* Dword 3 */ +#define FMC3_D_ONE      		(1<<16) + + + +/* 
_3DSTATE_INDEPENDENT_ALPHA_BLEND, p177 */ +#define _3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD	(CMD_3D|(0x0b<<24)) +#define IAB_MODIFY_ENABLE	        (1<<23) +#define IAB_ENABLE       	        (1<<22) +#define IAB_MODIFY_FUNC         	(1<<21) +#define IAB_FUNC_SHIFT          	16 +#define IAB_MODIFY_SRC_FACTOR   	(1<<11) +#define IAB_SRC_FACTOR_SHIFT		6 +#define IAB_SRC_FACTOR_MASK		(BLENDFACT_MASK<<6) +#define IAB_MODIFY_DST_FACTOR	        (1<<5) +#define IAB_DST_FACTOR_SHIFT		0 +#define IAB_DST_FACTOR_MASK		(BLENDFACT_MASK<<0) + + +#define BLENDFUNC_ADD			0x0 +#define BLENDFUNC_SUBTRACT		0x1 +#define BLENDFUNC_REVERSE_SUBTRACT	0x2 +#define BLENDFUNC_MIN			0x3 +#define BLENDFUNC_MAX			0x4 +#define BLENDFUNC_MASK			0x7 + +/* 3DSTATE_LOAD_INDIRECT, p180 */ + +#define _3DSTATE_LOAD_INDIRECT	        (CMD_3D|(0x1d<<24)|(0x7<<16)) +#define LI0_STATE_STATIC_INDIRECT       (0x01<<8) +#define LI0_STATE_DYNAMIC_INDIRECT      (0x02<<8) +#define LI0_STATE_SAMPLER               (0x04<<8) +#define LI0_STATE_MAP                   (0x08<<8) +#define LI0_STATE_PROGRAM               (0x10<<8) +#define LI0_STATE_CONSTANTS             (0x20<<8) + +#define SIS0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define SIS0_FORCE_LOAD                 (1<<1) +#define SIS0_BUFFER_VALID               (1<<0) +#define SIS1_BUFFER_LENGTH(x)           ((x)&0xff) + +#define DIS0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define DIS0_BUFFER_RESET               (1<<1) +#define DIS0_BUFFER_VALID               (1<<0) + +#define SSB0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define SSB0_FORCE_LOAD                 (1<<1) +#define SSB0_BUFFER_VALID               (1<<0) +#define SSB1_BUFFER_LENGTH(x)           ((x)&0xff) + +#define MSB0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define MSB0_FORCE_LOAD                 (1<<1) +#define MSB0_BUFFER_VALID               (1<<0) +#define MSB1_BUFFER_LENGTH(x)           ((x)&0xff) + +#define PSP0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define PSP0_FORCE_LOAD                 (1<<1) +#define PSP0_BUFFER_VALID               (1<<0) +#define PSP1_BUFFER_LENGTH(x)           ((x)&0xff) + +#define PSC0_BUFFER_ADDRESS(x)          ((x)&~0x3) +#define PSC0_FORCE_LOAD                 (1<<1) +#define PSC0_BUFFER_VALID               (1<<0) +#define PSC1_BUFFER_LENGTH(x)           ((x)&0xff) + + + + + +/* _3DSTATE_RASTERIZATION_RULES */ +#define _3DSTATE_RASTER_RULES_CMD	(CMD_3D|(0x07<<24)) +#define ENABLE_POINT_RASTER_RULE	(1<<15) +#define OGL_POINT_RASTER_RULE		(1<<13) +#define ENABLE_TEXKILL_3D_4D            (1<<10) +#define TEXKILL_3D                      (0<<9) +#define TEXKILL_4D                      (1<<9) +#define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8) +#define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5) +#define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6) +#define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3) + +/* _3DSTATE_SCISSOR_ENABLE, p256 */ +#define _3DSTATE_SCISSOR_ENABLE_CMD	(CMD_3D|(0x1c<<24)|(0x10<<19)) +#define ENABLE_SCISSOR_RECT		((1<<1) | 1) +#define DISABLE_SCISSOR_RECT		(1<<1) + +/* _3DSTATE_SCISSOR_RECTANGLE_0, p257 */ +#define _3DSTATE_SCISSOR_RECT_0_CMD	(CMD_3D|(0x1d<<24)|(0x81<<16)|1) +/* Dword 1 */ +#define SCISSOR_RECT_0_YMIN(x)		((x)<<16) +#define SCISSOR_RECT_0_XMIN(x)		(x) +/* Dword 2 */ +#define SCISSOR_RECT_0_YMAX(x)		((x)<<16) +#define SCISSOR_RECT_0_XMAX(x)		(x) + +/* p189 */ +#define _3DSTATE_LOAD_STATE_IMMEDIATE_1   ((0x3<<29)|(0x1d<<24)|(0x04<<16)) +#define I1_LOAD_S(n)                      (1<<(4+n)) + +#define S0_VB_OFFSET_MASK              0xffffffc +#define S0_AUTO_CACHE_INV_DISABLE      (1<<0) + +#define 
S1_VERTEX_WIDTH_SHIFT          24 +#define S1_VERTEX_WIDTH_MASK           (0x3f<<24) +#define S1_VERTEX_PITCH_SHIFT          16 +#define S1_VERTEX_PITCH_MASK           (0x3f<<16) + +#define TEXCOORDFMT_2D                 0x0 +#define TEXCOORDFMT_3D                 0x1 +#define TEXCOORDFMT_4D                 0x2 +#define TEXCOORDFMT_1D                 0x3 +#define TEXCOORDFMT_2D_16              0x4 +#define TEXCOORDFMT_4D_16              0x5 +#define TEXCOORDFMT_NOT_PRESENT        0xf +#define S2_TEXCOORD_FMT0_MASK            0xf +#define S2_TEXCOORD_FMT1_SHIFT           4 +#define S2_TEXCOORD_FMT(unit, type)    ((type)<<(unit*4)) +#define S2_TEXCOORD_NONE               (~0) + +/* S3 not interesting */ + +#define S4_POINT_WIDTH_SHIFT           23 +#define S4_POINT_WIDTH_MASK            (0x1ff<<23) +#define S4_LINE_WIDTH_SHIFT            19 +#define S4_LINE_WIDTH_ONE              (0x2<<19) +#define S4_LINE_WIDTH_MASK             (0xf<<19) +#define S4_FLATSHADE_ALPHA             (1<<18) +#define S4_FLATSHADE_FOG               (1<<17) +#define S4_FLATSHADE_SPECULAR          (1<<16) +#define S4_FLATSHADE_COLOR             (1<<15) +#define S4_CULLMODE_BOTH	       (0<<13) +#define S4_CULLMODE_NONE	       (1<<13) +#define S4_CULLMODE_CW		       (2<<13) +#define S4_CULLMODE_CCW		       (3<<13) +#define S4_CULLMODE_MASK	       (3<<13) +#define S4_VFMT_POINT_WIDTH            (1<<12) +#define S4_VFMT_SPEC_FOG               (1<<11) +#define S4_VFMT_COLOR                  (1<<10) +#define S4_VFMT_DEPTH_OFFSET           (1<<9) +#define S4_VFMT_XYZ     	       (1<<6) +#define S4_VFMT_XYZW     	       (2<<6) +#define S4_VFMT_XY     		       (3<<6) +#define S4_VFMT_XYW     	       (4<<6) +#define S4_VFMT_XYZW_MASK              (7<<6) +#define S4_FORCE_DEFAULT_DIFFUSE       (1<<5) +#define S4_FORCE_DEFAULT_SPECULAR      (1<<4) +#define S4_LOCAL_DEPTH_OFFSET_ENABLE   (1<<3) +#define S4_VFMT_FOG_PARAM              (1<<2) +#define S4_SPRITE_POINT_ENABLE         (1<<1) +#define S4_LINE_ANTIALIAS_ENABLE       (1<<0) + +#define S4_VFMT_MASK (S4_VFMT_POINT_WIDTH   | 	\ +		      S4_VFMT_SPEC_FOG      |	\ +		      S4_VFMT_COLOR         |	\ +		      S4_VFMT_DEPTH_OFFSET  |	\ +		      S4_VFMT_XYZW_MASK     |	\ +		      S4_VFMT_FOG_PARAM) + + +#define S5_WRITEDISABLE_ALPHA          (1<<31) +#define S5_WRITEDISABLE_RED            (1<<30) +#define S5_WRITEDISABLE_GREEN          (1<<29) +#define S5_WRITEDISABLE_BLUE           (1<<28) +#define S5_WRITEDISABLE_MASK           (0xf<<28) +#define S5_FORCE_DEFAULT_POINT_SIZE    (1<<27) +#define S5_LAST_PIXEL_ENABLE           (1<<26) +#define S5_GLOBAL_DEPTH_OFFSET_ENABLE  (1<<25) +#define S5_FOG_ENABLE                  (1<<24) +#define S5_STENCIL_REF_SHIFT           16 +#define S5_STENCIL_REF_MASK            (0xff<<16) +#define S5_STENCIL_TEST_FUNC_SHIFT     13 +#define S5_STENCIL_TEST_FUNC_MASK      (0x7<<13) +#define S5_STENCIL_FAIL_SHIFT          10 +#define S5_STENCIL_FAIL_MASK           (0x7<<10) +#define S5_STENCIL_PASS_Z_FAIL_SHIFT   7 +#define S5_STENCIL_PASS_Z_FAIL_MASK    (0x7<<7) +#define S5_STENCIL_PASS_Z_PASS_SHIFT   4 +#define S5_STENCIL_PASS_Z_PASS_MASK    (0x7<<4) +#define S5_STENCIL_WRITE_ENABLE        (1<<3) +#define S5_STENCIL_TEST_ENABLE         (1<<2) +#define S5_COLOR_DITHER_ENABLE         (1<<1) +#define S5_LOGICOP_ENABLE              (1<<0) + + +#define S6_ALPHA_TEST_ENABLE           (1<<31) +#define S6_ALPHA_TEST_FUNC_SHIFT       28 +#define S6_ALPHA_TEST_FUNC_MASK        (0x7<<28) +#define S6_ALPHA_REF_SHIFT             20 +#define S6_ALPHA_REF_MASK     
         (0xff<<20) +#define S6_DEPTH_TEST_ENABLE           (1<<19) +#define S6_DEPTH_TEST_FUNC_SHIFT       16 +#define S6_DEPTH_TEST_FUNC_MASK        (0x7<<16) +#define S6_CBUF_BLEND_ENABLE           (1<<15) +#define S6_CBUF_BLEND_FUNC_SHIFT       12 +#define S6_CBUF_BLEND_FUNC_MASK        (0x7<<12) +#define S6_CBUF_SRC_BLEND_FACT_SHIFT   8 +#define S6_CBUF_SRC_BLEND_FACT_MASK    (0xf<<8) +#define S6_CBUF_DST_BLEND_FACT_SHIFT   4 +#define S6_CBUF_DST_BLEND_FACT_MASK    (0xf<<4) +#define S6_DEPTH_WRITE_ENABLE          (1<<3) +#define S6_COLOR_WRITE_ENABLE          (1<<2) +#define S6_TRISTRIP_PV_SHIFT           0 +#define S6_TRISTRIP_PV_MASK            (0x3<<0) + +#define S7_DEPTH_OFFSET_CONST_MASK     ~0 + + + +#define DST_BLND_FACT(f) ((f)<<S6_CBUF_DST_BLEND_FACT_SHIFT) +#define SRC_BLND_FACT(f) ((f)<<S6_CBUF_SRC_BLEND_FACT_SHIFT) +#define DST_ABLND_FACT(f) ((f)<<IAB_DST_FACTOR_SHIFT) +#define SRC_ABLND_FACT(f) ((f)<<IAB_SRC_FACTOR_SHIFT) + + + + +/* 3DSTATE_MAP_DEINTERLACER_PARAMETERS */ + +/* 3DSTATE_MAP_PALETTE_LOAD_32, p206 */ +#define _3DSTATE_MAP_PALETTE_LOAD_32    (CMD_3D|(0x1d<<24)|(0x8f<<16)) +/* subsequent dwords up to length (max 16) are ARGB8888 color values */ + +/* _3DSTATE_MODES_4, p218 */ +#define _3DSTATE_MODES_4_CMD		(CMD_3D|(0x0d<<24)) +#define ENABLE_LOGIC_OP_FUNC		(1<<23) +#define LOGIC_OP_FUNC(x)		((x)<<18) +#define LOGICOP_MASK			(0xf<<18) +#define MODE4_ENABLE_STENCIL_TEST_MASK	((1<<17)|(0xff00)) +#define ENABLE_STENCIL_TEST_MASK	(1<<17) +#define STENCIL_TEST_MASK(x)		(((x)&0xff)<<8) +#define MODE4_ENABLE_STENCIL_WRITE_MASK	((1<<16)|(0x00ff)) +#define ENABLE_STENCIL_WRITE_MASK	(1<<16) +#define STENCIL_WRITE_MASK(x)		((x)&0xff) + +/* _3DSTATE_MODES_5, p220 */ +#define _3DSTATE_MODES_5_CMD		(CMD_3D|(0x0c<<24)) +#define PIPELINE_FLUSH_RENDER_CACHE	(1<<18) +#define PIPELINE_FLUSH_TEXTURE_CACHE	(1<<16) + + +/* p221 */ +#define _3DSTATE_PIXEL_SHADER_CONSTANTS  (CMD_3D|(0x1d<<24)|(0x6<<16)) +#define PS1_REG(n)                      (1<<(n)) +#define PS2_CONST_X(n)                  (n) +#define PS3_CONST_Y(n)                  (n) +#define PS4_CONST_Z(n)                  (n) +#define PS5_CONST_W(n)                  (n) + +/* p222 */ + + +#define I915_MAX_TEX_INDIRECT 4 +#define I915_MAX_TEX_INSN     32 +#define I915_MAX_ALU_INSN     64 +#define I915_MAX_DECL_INSN    27 +#define I915_MAX_TEMPORARY    16 + + +/* Each instruction is 3 dwords long, though most don't require all + * this space.  Maximum of 123 instructions.  Smaller maxes per insn + * type. + */ +#define _3DSTATE_PIXEL_SHADER_PROGRAM    (CMD_3D|(0x1d<<24)|(0x5<<16)) + +#define REG_TYPE_R                 0    /* temporary regs, no need to +                                         * dcl, must be written before +                                         * read -- Preserved between +                                         * phases.  +                                         */ +#define REG_TYPE_T                 1    /* Interpolated values, must be +                                         * dcl'ed before use. +                                         * +                                         * 0..7: texture coord, +                                         * 8: diffuse spec, +                                         * 9: specular color, +                                         * 10: fog parameter in w. 
+                                         */ +#define REG_TYPE_CONST             2    /* Restriction: only one const +                                         * can be referenced per +                                         * instruction, though it may be +                                         * selected for multiple inputs. +                                         * Constants not initialized +                                         * default to zero. +                                         */ +#define REG_TYPE_S                 3    /* sampler */ +#define REG_TYPE_OC                4    /* output color (rgba) */ +#define REG_TYPE_OD                5    /* output depth (w), xyz are +                                         * temporaries.  If not written, +                                         * interpolated depth is used? +                                         */ +#define REG_TYPE_U                 6    /* unpreserved temporaries */ +#define REG_TYPE_MASK              0x7 +#define REG_NR_MASK                0xf + + +/* REG_TYPE_T: + */ +#define T_TEX0     0 +#define T_TEX1     1 +#define T_TEX2     2 +#define T_TEX3     3 +#define T_TEX4     4 +#define T_TEX5     5 +#define T_TEX6     6 +#define T_TEX7     7 +#define T_DIFFUSE  8 +#define T_SPECULAR 9 +#define T_FOG_W    10           /* interpolated fog is in W coord */ + +/* Arithmetic instructions */ + +/* .replicate_swizzle == selection and replication of a particular + * scalar channel, ie., .xxxx, .yyyy, .zzzz or .wwww  + */ +#define A0_NOP    (0x0<<24)     /* no operation */ +#define A0_ADD    (0x1<<24)     /* dst = src0 + src1 */ +#define A0_MOV    (0x2<<24)     /* dst = src0 */ +#define A0_MUL    (0x3<<24)     /* dst = src0 * src1 */ +#define A0_MAD    (0x4<<24)     /* dst = src0 * src1 + src2 */ +#define A0_DP2ADD (0x5<<24)     /* dst.xyzw = src0.xy dot src1.xy + src2.replicate_swizzle */ +#define A0_DP3    (0x6<<24)     /* dst.xyzw = src0.xyz dot src1.xyz */ +#define A0_DP4    (0x7<<24)     /* dst.xyzw = src0.xyzw dot src1.xyzw */ +#define A0_FRC    (0x8<<24)     /* dst = src0 - floor(src0) */ +#define A0_RCP    (0x9<<24)     /* dst.xyzw = 1/(src0.replicate_swizzle) */ +#define A0_RSQ    (0xa<<24)     /* dst.xyzw = 1/(sqrt(abs(src0.replicate_swizzle))) */ +#define A0_EXP    (0xb<<24)     /* dst.xyzw = exp2(src0.replicate_swizzle) */ +#define A0_LOG    (0xc<<24)     /* dst.xyzw = log2(abs(src0.replicate_swizzle)) */ +#define A0_CMP    (0xd<<24)     /* dst = (src0 >= 0.0) ? src1 : src2 */ +#define A0_MIN    (0xe<<24)     /* dst = (src0 < src1) ? src0 : src1 */ +#define A0_MAX    (0xf<<24)     /* dst = (src0 >= src1) ? src0 : src1 */ +#define A0_FLR    (0x10<<24)    /* dst = floor(src0) */ +#define A0_MOD    (0x11<<24)    /* dst = src0 fmod 1.0 */ +#define A0_TRC    (0x12<<24)    /* dst = int(src0) */ +#define A0_SGE    (0x13<<24)    /* dst = src0 >= src1 ? 1.0 : 0.0 */ +#define A0_SLT    (0x14<<24)    /* dst = src0 < src1 ? 
1.0 : 0.0 */ +#define A0_DEST_SATURATE                 (1<<22) +#define A0_DEST_TYPE_SHIFT                19 +/* Allow: R, OC, OD, U */ +#define A0_DEST_NR_SHIFT                 14 +/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */ +#define A0_DEST_CHANNEL_X                (1<<10) +#define A0_DEST_CHANNEL_Y                (2<<10) +#define A0_DEST_CHANNEL_Z                (4<<10) +#define A0_DEST_CHANNEL_W                (8<<10) +#define A0_DEST_CHANNEL_ALL              (0xf<<10) +#define A0_DEST_CHANNEL_SHIFT            10 +#define A0_SRC0_TYPE_SHIFT               7 +#define A0_SRC0_NR_SHIFT                 2 + +#define A0_DEST_CHANNEL_XY              (A0_DEST_CHANNEL_X|A0_DEST_CHANNEL_Y) +#define A0_DEST_CHANNEL_XYZ             (A0_DEST_CHANNEL_XY|A0_DEST_CHANNEL_Z) + + +#define SRC_X        0 +#define SRC_Y        1 +#define SRC_Z        2 +#define SRC_W        3 +#define SRC_ZERO     4 +#define SRC_ONE      5 + +#define A1_SRC0_CHANNEL_X_NEGATE         (1<<31) +#define A1_SRC0_CHANNEL_X_SHIFT          28 +#define A1_SRC0_CHANNEL_Y_NEGATE         (1<<27) +#define A1_SRC0_CHANNEL_Y_SHIFT          24 +#define A1_SRC0_CHANNEL_Z_NEGATE         (1<<23) +#define A1_SRC0_CHANNEL_Z_SHIFT          20 +#define A1_SRC0_CHANNEL_W_NEGATE         (1<<19) +#define A1_SRC0_CHANNEL_W_SHIFT          16 +#define A1_SRC1_TYPE_SHIFT               13 +#define A1_SRC1_NR_SHIFT                 8 +#define A1_SRC1_CHANNEL_X_NEGATE         (1<<7) +#define A1_SRC1_CHANNEL_X_SHIFT          4 +#define A1_SRC1_CHANNEL_Y_NEGATE         (1<<3) +#define A1_SRC1_CHANNEL_Y_SHIFT          0 + +#define A2_SRC1_CHANNEL_Z_NEGATE         (1<<31) +#define A2_SRC1_CHANNEL_Z_SHIFT          28 +#define A2_SRC1_CHANNEL_W_NEGATE         (1<<27) +#define A2_SRC1_CHANNEL_W_SHIFT          24 +#define A2_SRC2_TYPE_SHIFT               21 +#define A2_SRC2_NR_SHIFT                 16 +#define A2_SRC2_CHANNEL_X_NEGATE         (1<<15) +#define A2_SRC2_CHANNEL_X_SHIFT          12 +#define A2_SRC2_CHANNEL_Y_NEGATE         (1<<11) +#define A2_SRC2_CHANNEL_Y_SHIFT          8 +#define A2_SRC2_CHANNEL_Z_NEGATE         (1<<7) +#define A2_SRC2_CHANNEL_Z_SHIFT          4 +#define A2_SRC2_CHANNEL_W_NEGATE         (1<<3) +#define A2_SRC2_CHANNEL_W_SHIFT          0 + + + +/* Texture instructions */ +#define T0_TEXLD     (0x15<<24) /* Sample texture using predeclared +                                 * sampler and address, and output +                                 * filtered texel data to destination +                                 * register */ +#define T0_TEXLDP    (0x16<<24) /* Same as texld but performs a +                                 * perspective divide of the texture +                                 * coordinate .xyz values by .w before +                                 * sampling. */ +#define T0_TEXLDB    (0x17<<24) /* Same as texld but biases the +                                 * computed LOD by w.  Only S4.6 two's +                                 * comp is used.  This implies that a +                                 * float to fixed conversion is +                                 * done. */ +#define T0_TEXKILL   (0x18<<24) /* Does not perform a sampling +                                 * operation.  Simply kills the pixel +                                 * if any channel of the address +                                 * register is < 0.0. 
*/ +#define T0_DEST_TYPE_SHIFT                19 +/* Allow: R, OC, OD, U */ +/* Note: U (unpreserved) regs do not retain their values between + * phases (cannot be used for feedback)  + * + * Note: oC and OD registers can only be used as the destination of a + * texture instruction once per phase (this is an implementation + * restriction).  + */ +#define T0_DEST_NR_SHIFT                 14 +/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */ +#define T0_SAMPLER_NR_SHIFT              0      /* This field ignored for TEXKILL */ +#define T0_SAMPLER_NR_MASK               (0xf<<0) + +#define T1_ADDRESS_REG_TYPE_SHIFT        24     /* Reg to use as texture coord */ +/* Allow R, T, OC, OD -- R, OC, OD are 'dependent' reads, new program phase */ +#define T1_ADDRESS_REG_NR_SHIFT          17 +#define T2_MBZ                           0 + +/* Declaration instructions */ +#define D0_DCL       (0x19<<24) /* Declare a t (interpolated attrib) +                                 * register or an s (sampler) +                                 * register. */ +#define D0_SAMPLE_TYPE_SHIFT              22 +#define D0_SAMPLE_TYPE_2D                 (0x0<<22) +#define D0_SAMPLE_TYPE_CUBE               (0x1<<22) +#define D0_SAMPLE_TYPE_VOLUME             (0x2<<22) +#define D0_SAMPLE_TYPE_MASK               (0x3<<22) + +#define D0_TYPE_SHIFT                19 +/* Allow: T, S */ +#define D0_NR_SHIFT                  14 +/* Allow T: 0..10, S: 0..15 */ +#define D0_CHANNEL_X                (1<<10) +#define D0_CHANNEL_Y                (2<<10) +#define D0_CHANNEL_Z                (4<<10) +#define D0_CHANNEL_W                (8<<10) +#define D0_CHANNEL_ALL              (0xf<<10) +#define D0_CHANNEL_NONE             (0<<10) + +#define D0_CHANNEL_XY               (D0_CHANNEL_X|D0_CHANNEL_Y) +#define D0_CHANNEL_XYZ              (D0_CHANNEL_XY|D0_CHANNEL_Z) + +/* I915 Errata: Do not allow (xz), (xw), (xzw) combinations for diffuse + * or specular declarations.  
+ * + * For T dcls, only allow: (x), (xy), (xyz), (w), (xyzw)  + * + * Must be zero for S (sampler) dcls + */ +#define D1_MBZ                          0 +#define D2_MBZ                          0 + + + +/* p207 */ +#define _3DSTATE_MAP_STATE               (CMD_3D|(0x1d<<24)|(0x0<<16)) + +#define MS1_MAPMASK_SHIFT               0 +#define MS1_MAPMASK_MASK                (0x8fff<<0) + +#define MS2_UNTRUSTED_SURFACE           (1<<31) +#define MS2_ADDRESS_MASK                0xfffffffc +#define MS2_VERTICAL_LINE_STRIDE        (1<<1) +#define MS2_VERTICAL_OFFSET             (1<<1) + +#define MS3_HEIGHT_SHIFT              21 +#define MS3_WIDTH_SHIFT               10 +#define MS3_PALETTE_SELECT            (1<<9) +#define MS3_MAPSURF_FORMAT_SHIFT      7 +#define MS3_MAPSURF_FORMAT_MASK       (0x7<<7) +#define    MAPSURF_8BIT		 	   (1<<7) +#define    MAPSURF_16BIT		   (2<<7) +#define    MAPSURF_32BIT		   (3<<7) +#define    MAPSURF_422			   (5<<7) +#define    MAPSURF_COMPRESSED		   (6<<7) +#define    MAPSURF_4BIT_INDEXED		   (7<<7) +#define MS3_MT_FORMAT_MASK         (0x7 << 3) +#define MS3_MT_FORMAT_SHIFT        3 +#define    MT_4BIT_IDX_ARGB8888	           (7<<3)       /* SURFACE_4BIT_INDEXED */ +#define    MT_8BIT_I8		           (0<<3)       /* SURFACE_8BIT */ +#define    MT_8BIT_L8		           (1<<3) +#define    MT_8BIT_A8		           (4<<3) +#define    MT_8BIT_MONO8	           (5<<3) +#define    MT_16BIT_RGB565 		   (0<<3)       /* SURFACE_16BIT */ +#define    MT_16BIT_ARGB1555		   (1<<3) +#define    MT_16BIT_ARGB4444		   (2<<3) +#define    MT_16BIT_AY88		   (3<<3) +#define    MT_16BIT_88DVDU	           (5<<3) +#define    MT_16BIT_BUMP_655LDVDU	   (6<<3) +#define    MT_16BIT_I16	                   (7<<3) +#define    MT_16BIT_L16	                   (8<<3) +#define    MT_16BIT_A16	                   (9<<3) +#define    MT_32BIT_ARGB8888		   (0<<3)       /* SURFACE_32BIT */ +#define    MT_32BIT_ABGR8888		   (1<<3) +#define    MT_32BIT_XRGB8888		   (2<<3) +#define    MT_32BIT_XBGR8888		   (3<<3) +#define    MT_32BIT_QWVU8888		   (4<<3) +#define    MT_32BIT_AXVU8888		   (5<<3) +#define    MT_32BIT_LXVU8888	           (6<<3) +#define    MT_32BIT_XLVU8888	           (7<<3) +#define    MT_32BIT_ARGB2101010	           (8<<3) +#define    MT_32BIT_ABGR2101010	           (9<<3) +#define    MT_32BIT_AWVU2101010	           (0xA<<3) +#define    MT_32BIT_GR1616	           (0xB<<3) +#define    MT_32BIT_VU1616	           (0xC<<3) +#define    MT_32BIT_xI824	           (0xD<<3) +#define    MT_32BIT_xA824	           (0xE<<3) +#define    MT_32BIT_xL824	           (0xF<<3) +#define    MT_422_YCRCB_SWAPY	           (0<<3)       /* SURFACE_422 */ +#define    MT_422_YCRCB_NORMAL	           (1<<3) +#define    MT_422_YCRCB_SWAPUV	           (2<<3) +#define    MT_422_YCRCB_SWAPUVY	           (3<<3) +#define    MT_COMPRESS_DXT1		   (0<<3)       /* SURFACE_COMPRESSED */ +#define    MT_COMPRESS_DXT2_3	           (1<<3) +#define    MT_COMPRESS_DXT4_5	           (2<<3) +#define    MT_COMPRESS_FXT1		   (3<<3) +#define    MT_COMPRESS_DXT1_RGB		   (4<<3) +#define MS3_USE_FENCE_REGS              (1<<2) +#define MS3_TILED_SURFACE             (1<<1) +#define MS3_TILE_WALK                 (1<<0) + +#define MS4_PITCH_SHIFT                 21 +#define MS4_CUBE_FACE_ENA_NEGX          (1<<20) +#define MS4_CUBE_FACE_ENA_POSX          (1<<19) +#define MS4_CUBE_FACE_ENA_NEGY          (1<<18) +#define MS4_CUBE_FACE_ENA_POSY          (1<<17) +#define MS4_CUBE_FACE_ENA_NEGZ          (1<<16) +#define MS4_CUBE_FACE_ENA_POSZ          (1<<15) 
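A minimal sketch of how the _3DSTATE_MAP_STATE dword-3 fields defined above might be packed for a 2D ARGB8888 texture. The helper name is hypothetical and the width-minus-one/height-minus-one encoding is an assumption made for illustration; none of this is part of the original header.

static inline unsigned
example_pack_map_state_ms3(unsigned width, unsigned height)
{
   /* Illustrative only: assumes the hardware expects dimension-minus-one
    * in the width/height fields; check the programming docs before reuse.
    */
   return ((height - 1) << MS3_HEIGHT_SHIFT) |
          ((width - 1) << MS3_WIDTH_SHIFT) |
          MAPSURF_32BIT |          /* 32-bit surface class */
          MT_32BIT_ARGB8888;       /* ARGB8888 layout within that class */
}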
+#define MS4_CUBE_FACE_ENA_MASK          (0x3f<<15) +#define MS4_MAX_LOD_SHIFT		9 +#define MS4_MAX_LOD_MASK		(0x3f<<9) +#define MS4_MIP_LAYOUT_LEGACY           (0<<8) +#define MS4_MIP_LAYOUT_BELOW_LPT        (0<<8) +#define MS4_MIP_LAYOUT_RIGHT_LPT        (1<<8) +#define MS4_VOLUME_DEPTH_SHIFT          0 +#define MS4_VOLUME_DEPTH_MASK           (0xff<<0) + +/* p244 */ +#define _3DSTATE_SAMPLER_STATE         (CMD_3D|(0x1d<<24)|(0x1<<16)) + +#define SS1_MAPMASK_SHIFT               0 +#define SS1_MAPMASK_MASK                (0x8fff<<0) + +#define SS2_REVERSE_GAMMA_ENABLE        (1<<31) +#define SS2_PACKED_TO_PLANAR_ENABLE     (1<<30) +#define SS2_COLORSPACE_CONVERSION       (1<<29) +#define SS2_CHROMAKEY_SHIFT             27 +#define SS2_BASE_MIP_LEVEL_SHIFT        22 +#define SS2_BASE_MIP_LEVEL_MASK         (0x1f<<22) +#define SS2_MIP_FILTER_SHIFT            20 +#define SS2_MIP_FILTER_MASK             (0x3<<20) +#define   MIPFILTER_NONE       	0 +#define   MIPFILTER_NEAREST	1 +#define   MIPFILTER_LINEAR	3 +#define SS2_MAG_FILTER_SHIFT          17 +#define SS2_MAG_FILTER_MASK           (0x7<<17) +#define   FILTER_NEAREST	0 +#define   FILTER_LINEAR		1 +#define   FILTER_ANISOTROPIC	2 +#define   FILTER_4X4_1    	3 +#define   FILTER_4X4_2    	4 +#define   FILTER_4X4_FLAT 	5 +#define   FILTER_6X5_MONO   	6       /* XXX - check */ +#define SS2_MIN_FILTER_SHIFT          14 +#define SS2_MIN_FILTER_MASK           (0x7<<14) +#define SS2_LOD_BIAS_SHIFT            5 +#define SS2_LOD_BIAS_ONE              (0x10<<5) +#define SS2_LOD_BIAS_MASK             (0x1ff<<5) +/* Shadow requires: + *  MT_X8{I,L,A}24 or MT_{I,L,A}16 texture format + *  FILTER_4X4_x  MIN and MAG filters + */ +#define SS2_SHADOW_ENABLE             (1<<4) +#define SS2_MAX_ANISO_MASK            (1<<3) +#define SS2_MAX_ANISO_2               (0<<3) +#define SS2_MAX_ANISO_4               (1<<3) +#define SS2_SHADOW_FUNC_SHIFT         0 +#define SS2_SHADOW_FUNC_MASK          (0x7<<0) +/* SS2_SHADOW_FUNC values: see COMPAREFUNC_* */ + +#define SS3_MIN_LOD_SHIFT            24 +#define SS3_MIN_LOD_ONE              (0x10<<24) +#define SS3_MIN_LOD_MASK             (0xff<<24) +#define SS3_KILL_PIXEL_ENABLE        (1<<17) +#define SS3_TCX_ADDR_MODE_SHIFT      12 +#define SS3_TCX_ADDR_MODE_MASK       (0x7<<12) +#define   TEXCOORDMODE_WRAP		0 +#define   TEXCOORDMODE_MIRROR		1 +#define   TEXCOORDMODE_CLAMP_EDGE	2 +#define   TEXCOORDMODE_CUBE       	3 +#define   TEXCOORDMODE_CLAMP_BORDER	4 +#define   TEXCOORDMODE_MIRROR_ONCE      5 +#define SS3_TCY_ADDR_MODE_SHIFT      9 +#define SS3_TCY_ADDR_MODE_MASK       (0x7<<9) +#define SS3_TCZ_ADDR_MODE_SHIFT      6 +#define SS3_TCZ_ADDR_MODE_MASK       (0x7<<6) +#define SS3_NORMALIZED_COORDS        (1<<5) +#define SS3_TEXTUREMAP_INDEX_SHIFT   1 +#define SS3_TEXTUREMAP_INDEX_MASK    (0xf<<1) +#define SS3_DEINTERLACER_ENABLE      (1<<0) + +#define SS4_BORDER_COLOR_MASK        (~0) + +/* 3DSTATE_SPAN_STIPPLE, p258 + */ +#define _3DSTATE_STIPPLE           ((0x3<<29)|(0x1d<<24)|(0x83<<16)) +#define ST1_ENABLE               (1<<16) +#define ST1_MASK                 (0xffff) + +#define _3DSTATE_DEFAULT_Z          ((0x3<<29)|(0x1d<<24)|(0x98<<16)) +#define _3DSTATE_DEFAULT_DIFFUSE    ((0x3<<29)|(0x1d<<24)|(0x99<<16)) +#define _3DSTATE_DEFAULT_SPECULAR   ((0x3<<29)|(0x1d<<24)|(0x9a<<16)) + + +#define MI_FLUSH                   ((0<<29)|(4<<23)) +#define FLUSH_MAP_CACHE            (1<<0) +#define INHIBIT_FLUSH_RENDER_CACHE (1<<2) + + +#define CMD_3D (0x3<<29) + + +#define _3DPRIMITIVE         ((0x3<<29)|(0x1f<<24)) 
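Similarly, a hedged sketch of combining the SS2 filter fields defined above into a single sampler-state dword, mirroring what a sampler-state creation routine would do; the helper name and parameter names are illustrative and not part of the original header.

static inline unsigned
example_pack_sampler_ss2(unsigned min_filter, unsigned mag_filter,
                         unsigned mip_filter)
{
   /* min_filter/mag_filter take FILTER_* values, mip_filter takes a
    * MIPFILTER_* value; each is simply shifted into its bitfield.
    */
   return (min_filter << SS2_MIN_FILTER_SHIFT) |
          (mag_filter << SS2_MAG_FILTER_SHIFT) |
          (mip_filter << SS2_MIP_FILTER_SHIFT);
}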
+#define PRIM_INDIRECT            (1<<23) +#define PRIM_INLINE              (0<<23) +#define PRIM_INDIRECT_SEQUENTIAL (0<<17) +#define PRIM_INDIRECT_ELTS       (1<<17) + +#define PRIM3D_TRILIST		(0x0<<18) +#define PRIM3D_TRISTRIP 	(0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18) +#define PRIM3D_TRIFAN		(0x3<<18) +#define PRIM3D_POLY		(0x4<<18) +#define PRIM3D_LINELIST 	(0x5<<18) +#define PRIM3D_LINESTRIP	(0x6<<18) +#define PRIM3D_RECTLIST 	(0x7<<18) +#define PRIM3D_POINTLIST	(0x8<<18) +#define PRIM3D_DIB		(0x9<<18) +#define PRIM3D_MASK		(0x1f<<18) + +#define I915PACKCOLOR4444(r,g,b,a) \ +  ((((a) & 0xf0) << 8) | (((r) & 0xf0) << 4) | ((g) & 0xf0) | ((b) >> 4)) + +#define I915PACKCOLOR1555(r,g,b,a) \ +  ((((r) & 0xf8) << 7) | (((g) & 0xf8) << 2) | (((b) & 0xf8) >> 3) | \ +    ((a) ? 0x8000 : 0)) + +#define I915PACKCOLOR565(r,g,b) \ +  ((((r) & 0xf8) << 8) | (((g) & 0xfc) << 3) | (((b) & 0xf8) >> 3)) + +#define I915PACKCOLOR8888(r,g,b,a) \ +  ((a<<24) | (r<<16) | (g<<8) | b) + + + + +#define BR00_BITBLT_CLIENT   0x40000000 +#define BR00_OP_COLOR_BLT    0x10000000 +#define BR00_OP_SRC_COPY_BLT 0x10C00000 +#define BR13_SOLID_PATTERN   0x80000000 + +#define XY_COLOR_BLT_CMD		((2<<29)|(0x50<<22)|0x4) +#define XY_COLOR_BLT_WRITE_ALPHA	(1<<21) +#define XY_COLOR_BLT_WRITE_RGB		(1<<20) + +#define XY_SRC_COPY_BLT_CMD             ((2<<29)|(0x53<<22)|6) +#define XY_SRC_COPY_BLT_WRITE_ALPHA     (1<<21) +#define XY_SRC_COPY_BLT_WRITE_RGB       (1<<20) + +#define MI_WAIT_FOR_EVENT               ((0x3<<23)) +#define MI_WAIT_FOR_PLANE_B_FLIP        (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP        (1<<2) + +#define MI_BATCH_BUFFER                 (0x30<<23) +#define MI_BATCH_BUFFER_START           (0x31<<23) +#define MI_BATCH_BUFFER_END             (0xa<<23) + + + +#define COMPAREFUNC_ALWAYS		0 +#define COMPAREFUNC_NEVER		0x1 +#define COMPAREFUNC_LESS		0x2 +#define COMPAREFUNC_EQUAL		0x3 +#define COMPAREFUNC_LEQUAL		0x4 +#define COMPAREFUNC_GREATER		0x5 +#define COMPAREFUNC_NOTEQUAL		0x6 +#define COMPAREFUNC_GEQUAL		0x7 + +#define STENCILOP_KEEP			0 +#define STENCILOP_ZERO			0x1 +#define STENCILOP_REPLACE		0x2 +#define STENCILOP_INCRSAT		0x3 +#define STENCILOP_DECRSAT		0x4 +#define STENCILOP_INCR			0x5 +#define STENCILOP_DECR			0x6 +#define STENCILOP_INVERT		0x7 + +#define LOGICOP_CLEAR			0 +#define LOGICOP_NOR			0x1 +#define LOGICOP_AND_INV 		0x2 +#define LOGICOP_COPY_INV		0x3 +#define LOGICOP_AND_RVRSE		0x4 +#define LOGICOP_INV			0x5 +#define LOGICOP_XOR			0x6 +#define LOGICOP_NAND			0x7 +#define LOGICOP_AND			0x8 +#define LOGICOP_EQUIV			0x9 +#define LOGICOP_NOOP			0xa +#define LOGICOP_OR_INV			0xb +#define LOGICOP_COPY			0xc +#define LOGICOP_OR_RVRSE		0xd +#define LOGICOP_OR			0xe +#define LOGICOP_SET			0xf + +#define BLENDFACT_ZERO			0x01 +#define BLENDFACT_ONE			0x02 +#define BLENDFACT_SRC_COLR		0x03 +#define BLENDFACT_INV_SRC_COLR 		0x04 +#define BLENDFACT_SRC_ALPHA		0x05 +#define BLENDFACT_INV_SRC_ALPHA 	0x06 +#define BLENDFACT_DST_ALPHA		0x07 +#define BLENDFACT_INV_DST_ALPHA 	0x08 +#define BLENDFACT_DST_COLR		0x09 +#define BLENDFACT_INV_DST_COLR		0x0a +#define BLENDFACT_SRC_ALPHA_SATURATE	0x0b +#define BLENDFACT_CONST_COLOR		0x0c +#define BLENDFACT_INV_CONST_COLOR	0x0d +#define BLENDFACT_CONST_ALPHA		0x0e +#define BLENDFACT_INV_CONST_ALPHA	0x0f +#define BLENDFACT_MASK          	0x0f + +#define PCI_CHIP_I915_G			0x2582 +#define PCI_CHIP_I915_GM		0x2592 +#define PCI_CHIP_I945_G			0x2772 +#define PCI_CHIP_I945_GM		0x27A2 +#define PCI_CHIP_I945_GME		0x27AE +#define PCI_CHIP_G33_G			0x29C2 +#define 
PCI_CHIP_Q35_G			0x29B2 +#define PCI_CHIP_Q33_G			0x29D2 + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c new file mode 100644 index 0000000000..39e48105b3 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_screen.c @@ -0,0 +1,288 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_string.h" + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_screen.h" +#include "i915_texture.h" + + +static const char * +i915_get_vendor( struct pipe_screen *pscreen ) +{ +   return "Tungsten Graphics, Inc."; +} + + +static const char * +i915_get_name( struct pipe_screen *pscreen ) +{ +   static char buffer[128]; +   const char *chipset; + +   switch (i915_screen(pscreen)->pci_id) { +   case PCI_CHIP_I915_G: +      chipset = "915G"; +      break; +   case PCI_CHIP_I915_GM: +      chipset = "915GM"; +      break; +   case PCI_CHIP_I945_G: +      chipset = "945G"; +      break; +   case PCI_CHIP_I945_GM: +      chipset = "945GM"; +      break; +   case PCI_CHIP_I945_GME: +      chipset = "945GME"; +      break; +   case PCI_CHIP_G33_G: +      chipset = "G33"; +      break; +   case PCI_CHIP_Q35_G: +      chipset = "Q35"; +      break; +   case PCI_CHIP_Q33_G: +      chipset = "Q33"; +      break; +   default: +      chipset = "unknown"; +      break; +   } + +   util_snprintf(buffer, sizeof(buffer), "i915 (chipset: %s)", chipset); +   return buffer; +} + + +static int +i915_get_param(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +      return 8; +   case PIPE_CAP_NPOT_TEXTURES: +      return 1; +   case PIPE_CAP_TWO_SIDED_STENCIL: +      return 1; +   case PIPE_CAP_GLSL: +      return 0; +   case PIPE_CAP_S3TC: +      return 0; +   case PIPE_CAP_ANISOTROPIC_FILTER: +      return 0; +   case PIPE_CAP_POINT_SPRITE: +      return 0; +   case PIPE_CAP_MAX_RENDER_TARGETS: +      return 1; +   case PIPE_CAP_OCCLUSION_QUERY: +      return 0; +   case PIPE_CAP_TEXTURE_SHADOW_MAP: +      return 1; 
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +      return 11; /* max 1024x1024 */ +   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +      return 8;  /* max 128x128x128 */ +   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +      return 11; /* max 1024x1024 */ +   default: +      return 0; +   } +} + + +static float +i915_get_paramf(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_LINE_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_LINE_WIDTH_AA: +      return 7.5; + +   case PIPE_CAP_MAX_POINT_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_POINT_WIDTH_AA: +      return 255.0; + +   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +      return 4.0; + +   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +      return 16.0; + +   default: +      return 0; +   } +} + + +static boolean +i915_is_format_supported( struct pipe_screen *screen, +                          enum pipe_format format,  +                          enum pipe_texture_target target, +                          unsigned tex_usage,  +                          unsigned geom_flags ) +{ +   static const enum pipe_format tex_supported[] = { +      PIPE_FORMAT_R8G8B8A8_UNORM, +      PIPE_FORMAT_A8R8G8B8_UNORM, +      PIPE_FORMAT_R5G6B5_UNORM, +      PIPE_FORMAT_L8_UNORM, +      PIPE_FORMAT_A8_UNORM, +      PIPE_FORMAT_I8_UNORM, +      PIPE_FORMAT_A8L8_UNORM, +      PIPE_FORMAT_YCBCR, +      PIPE_FORMAT_YCBCR_REV, +      PIPE_FORMAT_S8Z24_UNORM, +      PIPE_FORMAT_NONE  /* list terminator */ +   }; +   static const enum pipe_format surface_supported[] = { +      PIPE_FORMAT_A8R8G8B8_UNORM, +      PIPE_FORMAT_R5G6B5_UNORM, +      PIPE_FORMAT_S8Z24_UNORM, +      /*PIPE_FORMAT_R16G16B16A16_SNORM,*/ +      PIPE_FORMAT_NONE  /* list terminator */ +   }; +   const enum pipe_format *list; +   uint i; + +   if(tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) +      list = surface_supported; +   else +      list = tex_supported; + +   for (i = 0; list[i] != PIPE_FORMAT_NONE; i++) { +      if (list[i] == format) +         return TRUE; +   } + +   return FALSE; +} + + +static void +i915_destroy_screen( struct pipe_screen *screen ) +{ +   struct pipe_winsys *winsys = screen->winsys; + +   if(winsys->destroy) +      winsys->destroy(winsys); + +   FREE(screen); +} + + +static void * +i915_surface_map( struct pipe_screen *screen, +                  struct pipe_surface *surface, +                  unsigned flags ) +{ +   struct i915_texture *tex = (struct i915_texture *)surface->texture; +   char *map = pipe_buffer_map( screen, tex->buffer, flags ); +   if (map == NULL) +      return NULL; + +   if (surface->texture && +       (flags & PIPE_BUFFER_USAGE_CPU_WRITE))  +   { +      /* Do something to notify contexts of a texture change.   
+       */ +      /* i915_screen(screen)->timestamp++; */ +   } +    +   return map + surface->offset; +} + +static void +i915_surface_unmap(struct pipe_screen *screen, +                   struct pipe_surface *surface) +{ +   struct i915_texture *tex = (struct i915_texture *)surface->texture; +   pipe_buffer_unmap( screen, tex->buffer ); +} + + + +/** + * Create a new i915_screen object + */ +struct pipe_screen * +i915_create_screen(struct pipe_winsys *winsys, uint pci_id) +{ +   struct i915_screen *i915screen = CALLOC_STRUCT(i915_screen); + +   if (!i915screen) +      return NULL; + +   switch (pci_id) { +   case PCI_CHIP_I915_G: +   case PCI_CHIP_I915_GM: +      i915screen->is_i945 = FALSE; +      break; + +   case PCI_CHIP_I945_G: +   case PCI_CHIP_I945_GM: +   case PCI_CHIP_I945_GME: +   case PCI_CHIP_G33_G: +   case PCI_CHIP_Q33_G: +   case PCI_CHIP_Q35_G: +      i915screen->is_i945 = TRUE; +      break; + +   default: +      debug_printf("%s: unknown pci id 0x%x, cannot create screen\n",  +                   __FUNCTION__, pci_id); +      return NULL; +   } + +   i915screen->pci_id = pci_id; + +   i915screen->screen.winsys = winsys; + +   i915screen->screen.destroy = i915_destroy_screen; + +   i915screen->screen.get_name = i915_get_name; +   i915screen->screen.get_vendor = i915_get_vendor; +   i915screen->screen.get_param = i915_get_param; +   i915screen->screen.get_paramf = i915_get_paramf; +   i915screen->screen.is_format_supported = i915_is_format_supported; +   i915screen->screen.surface_map = i915_surface_map; +   i915screen->screen.surface_unmap = i915_surface_unmap; + +   i915_init_screen_texture_functions(&i915screen->screen); +   u_simple_screen_init(&i915screen->screen); + +   return &i915screen->screen; +} diff --git a/src/gallium/drivers/i915simple/i915_screen.h b/src/gallium/drivers/i915simple/i915_screen.h new file mode 100644 index 0000000000..73b0ff05ce --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_screen.h @@ -0,0 +1,69 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#ifndef I915_SCREEN_H +#define I915_SCREEN_H + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Subclass of pipe_screen + */ +struct i915_screen +{ +   struct pipe_screen screen; + +   boolean is_i945; +   uint pci_id; +}; + + +/** cast wrapper */ +static INLINE struct i915_screen * +i915_screen(struct pipe_screen *pscreen) +{ +   return (struct i915_screen *) pscreen; +} + + +extern struct pipe_screen * +i915_create_screen(struct pipe_winsys *winsys, uint pci_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* I915_SCREEN_H */ diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c new file mode 100644 index 0000000000..273e74002a --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state.c @@ -0,0 +1,788 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "draw/draw_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_parse.h" + +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "i915_state_inlines.h" +#include "i915_fpc.h" + +/* The i915 (and related graphics cores) do not support GL_CLAMP.  The + * Intel drivers for "other operating systems" implement GL_CLAMP as + * GL_CLAMP_TO_EDGE, so the same is done here. 
+ */ +static unsigned +translate_wrap_mode(unsigned wrap) +{ +   switch (wrap) { +   case PIPE_TEX_WRAP_REPEAT: +      return TEXCOORDMODE_WRAP; +   case PIPE_TEX_WRAP_CLAMP: +      return TEXCOORDMODE_CLAMP_EDGE;   /* not quite correct */ +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      return TEXCOORDMODE_CLAMP_EDGE; +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      return TEXCOORDMODE_CLAMP_BORDER; +//   case PIPE_TEX_WRAP_MIRRORED_REPEAT: +//      return TEXCOORDMODE_MIRROR; +   default: +      return TEXCOORDMODE_WRAP; +   } +} + +static unsigned translate_img_filter( unsigned filter ) +{ +   switch (filter) { +   case PIPE_TEX_FILTER_NEAREST: +      return FILTER_NEAREST; +   case PIPE_TEX_FILTER_LINEAR: +      return FILTER_LINEAR; +   case PIPE_TEX_FILTER_ANISO: +      return FILTER_ANISOTROPIC; +   default: +      assert(0); +      return FILTER_NEAREST; +   } +} + +static unsigned translate_mip_filter( unsigned filter ) +{ +   switch (filter) { +   case PIPE_TEX_MIPFILTER_NONE: +      return MIPFILTER_NONE; +   case PIPE_TEX_MIPFILTER_NEAREST: +      return MIPFILTER_NEAREST; +   case PIPE_TEX_MIPFILTER_LINEAR: +      return MIPFILTER_LINEAR; +   default: +      assert(0); +      return MIPFILTER_NONE; +   } +} + + +/* None of this state is actually used for anything yet. + */ +static void * +i915_create_blend_state(struct pipe_context *pipe, +                        const struct pipe_blend_state *blend) +{ +   struct i915_blend_state *cso_data = CALLOC_STRUCT( i915_blend_state ); + +   { +      unsigned eqRGB  = blend->rgb_func; +      unsigned srcRGB = blend->rgb_src_factor; +      unsigned dstRGB = blend->rgb_dst_factor; + +      unsigned eqA    = blend->alpha_func; +      unsigned srcA   = blend->alpha_src_factor; +      unsigned dstA   = blend->alpha_dst_factor; + +      /* Special handling for MIN/MAX filter modes handled at +       * state_tracker level. 
+       */ + +      if (srcA != srcRGB || +	  dstA != dstRGB || +	  eqA != eqRGB) { + +	 cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD | +                          IAB_MODIFY_ENABLE | +                          IAB_ENABLE | +                          IAB_MODIFY_FUNC | +                          IAB_MODIFY_SRC_FACTOR | +                          IAB_MODIFY_DST_FACTOR | +                          SRC_ABLND_FACT(i915_translate_blend_factor(srcA)) | +                          DST_ABLND_FACT(i915_translate_blend_factor(dstA)) | +                          (i915_translate_blend_func(eqA) << IAB_FUNC_SHIFT)); +      } +      else { +	 cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD | +                          IAB_MODIFY_ENABLE | +                          0); +      } +   } + +   cso_data->modes4 |= (_3DSTATE_MODES_4_CMD | +                        ENABLE_LOGIC_OP_FUNC | +                        LOGIC_OP_FUNC(i915_translate_logic_op(blend->logicop_func))); + +   if (blend->logicop_enable) +      cso_data->LIS5 |= S5_LOGICOP_ENABLE; + +   if (blend->dither) +      cso_data->LIS5 |= S5_COLOR_DITHER_ENABLE; + +   if ((blend->colormask & PIPE_MASK_R) == 0) +      cso_data->LIS5 |= S5_WRITEDISABLE_RED; + +   if ((blend->colormask & PIPE_MASK_G) == 0) +      cso_data->LIS5 |= S5_WRITEDISABLE_GREEN; + +   if ((blend->colormask & PIPE_MASK_B) == 0) +      cso_data->LIS5 |= S5_WRITEDISABLE_BLUE; + +   if ((blend->colormask & PIPE_MASK_A) == 0) +      cso_data->LIS5 |= S5_WRITEDISABLE_ALPHA; + +   if (blend->blend_enable) { +      unsigned funcRGB = blend->rgb_func; +      unsigned srcRGB  = blend->rgb_src_factor; +      unsigned dstRGB  = blend->rgb_dst_factor; + +      cso_data->LIS6 |= (S6_CBUF_BLEND_ENABLE | +                         SRC_BLND_FACT(i915_translate_blend_factor(srcRGB)) | +                         DST_BLND_FACT(i915_translate_blend_factor(dstRGB)) | +                         (i915_translate_blend_func(funcRGB) << S6_CBUF_BLEND_FUNC_SHIFT)); +   } + +   return cso_data; +} + +static void i915_bind_blend_state(struct pipe_context *pipe, +                                  void *blend) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   i915->blend = (struct i915_blend_state*)blend; + +   i915->dirty |= I915_NEW_BLEND; +} + + +static void i915_delete_blend_state(struct pipe_context *pipe, void *blend) +{ +   FREE(blend); +} + +static void i915_set_blend_color( struct pipe_context *pipe, +			     const struct pipe_blend_color *blend_color ) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   i915->blend_color = *blend_color; + +   i915->dirty |= I915_NEW_BLEND; +} + +static void * +i915_create_sampler_state(struct pipe_context *pipe, +                          const struct pipe_sampler_state *sampler) +{ +   struct i915_sampler_state *cso = CALLOC_STRUCT( i915_sampler_state ); +   const unsigned ws = sampler->wrap_s; +   const unsigned wt = sampler->wrap_t; +   const unsigned wr = sampler->wrap_r; +   unsigned minFilt, magFilt; +   unsigned mipFilt; + +   cso->templ = sampler; + +   mipFilt = translate_mip_filter(sampler->min_mip_filter); +   minFilt = translate_img_filter( sampler->min_img_filter ); +   magFilt = translate_img_filter( sampler->mag_img_filter ); +    +   if (sampler->max_anisotropy > 2.0) { +      cso->state[0] |= SS2_MAX_ANISO_4; +   } + +   { +      int b = (int) (sampler->lod_bias * 16.0); +      b = CLAMP(b, -256, 255); +      cso->state[0] |= ((b << SS2_LOD_BIAS_SHIFT) & 
SS2_LOD_BIAS_MASK); +   } + +   /* Shadow: +    */ +   if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)  +   { +      cso->state[0] |= (SS2_SHADOW_ENABLE | +                        i915_translate_compare_func(sampler->compare_func)); + +      minFilt = FILTER_4X4_FLAT; +      magFilt = FILTER_4X4_FLAT; +   } + +   cso->state[0] |= ((minFilt << SS2_MIN_FILTER_SHIFT) | +                     (mipFilt << SS2_MIP_FILTER_SHIFT) | +                     (magFilt << SS2_MAG_FILTER_SHIFT)); + +   cso->state[1] |= +      ((translate_wrap_mode(ws) << SS3_TCX_ADDR_MODE_SHIFT) | +       (translate_wrap_mode(wt) << SS3_TCY_ADDR_MODE_SHIFT) | +       (translate_wrap_mode(wr) << SS3_TCZ_ADDR_MODE_SHIFT)); + +   if (sampler->normalized_coords) +      cso->state[1] |= SS3_NORMALIZED_COORDS; + +   { +      int minlod = (int) (16.0 * sampler->min_lod); +      int maxlod = (int) (16.0 * sampler->max_lod); +      minlod = CLAMP(minlod, 0, 16 * 11); +      maxlod = CLAMP(maxlod, 0, 16 * 11); + +      if (minlod > maxlod) +	 maxlod = minlod; + +      cso->minlod = minlod; +      cso->maxlod = maxlod; +   } + +   { +      ubyte r = float_to_ubyte(sampler->border_color[0]); +      ubyte g = float_to_ubyte(sampler->border_color[1]); +      ubyte b = float_to_ubyte(sampler->border_color[2]); +      ubyte a = float_to_ubyte(sampler->border_color[3]); +      cso->state[2] = I915PACKCOLOR8888(r, g, b, a); +   } +   return cso; +} + +static void i915_bind_sampler_states(struct pipe_context *pipe, +                                     unsigned num, void **sampler) +{ +   struct i915_context *i915 = i915_context(pipe); +   unsigned i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == i915->num_samplers && +       !memcmp(i915->sampler, sampler, num * sizeof(void *))) +      return; + +   draw_flush(i915->draw); + +   for (i = 0; i < num; ++i) +      i915->sampler[i] = sampler[i]; +   for (i = num; i < PIPE_MAX_SAMPLERS; ++i) +      i915->sampler[i] = NULL; + +   i915->num_samplers = num; + +   i915->dirty |= I915_NEW_SAMPLER; +} + +static void i915_delete_sampler_state(struct pipe_context *pipe, +                                      void *sampler) +{ +   FREE(sampler); +} + + +/** XXX move someday?  Or consolidate all these simple state setters + * into one file. 
+ */ + +static void * +i915_create_depth_stencil_state(struct pipe_context *pipe, +				const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ +   struct i915_depth_stencil_state *cso = CALLOC_STRUCT( i915_depth_stencil_state ); + +   { +      int testmask = depth_stencil->stencil[0].valuemask & 0xff; +      int writemask = depth_stencil->stencil[0].writemask & 0xff; + +      cso->stencil_modes4 |= (_3DSTATE_MODES_4_CMD | +                              ENABLE_STENCIL_TEST_MASK | +                              STENCIL_TEST_MASK(testmask) | +                              ENABLE_STENCIL_WRITE_MASK | +                              STENCIL_WRITE_MASK(writemask)); +   } + +   if (depth_stencil->stencil[0].enabled) { +      int test = i915_translate_compare_func(depth_stencil->stencil[0].func); +      int fop  = i915_translate_stencil_op(depth_stencil->stencil[0].fail_op); +      int dfop = i915_translate_stencil_op(depth_stencil->stencil[0].zfail_op); +      int dpop = i915_translate_stencil_op(depth_stencil->stencil[0].zpass_op); +      int ref  = depth_stencil->stencil[0].ref_value & 0xff; + +      cso->stencil_LIS5 |= (S5_STENCIL_TEST_ENABLE | +                            S5_STENCIL_WRITE_ENABLE | +                            (ref  << S5_STENCIL_REF_SHIFT) | +                            (test << S5_STENCIL_TEST_FUNC_SHIFT) | +                            (fop  << S5_STENCIL_FAIL_SHIFT) | +                            (dfop << S5_STENCIL_PASS_Z_FAIL_SHIFT) | +                            (dpop << S5_STENCIL_PASS_Z_PASS_SHIFT)); +   } + +   if (depth_stencil->stencil[1].enabled) { +      int test  = i915_translate_compare_func(depth_stencil->stencil[1].func); +      int fop   = i915_translate_stencil_op(depth_stencil->stencil[1].fail_op); +      int dfop  = i915_translate_stencil_op(depth_stencil->stencil[1].zfail_op); +      int dpop  = i915_translate_stencil_op(depth_stencil->stencil[1].zpass_op); +      int ref   = depth_stencil->stencil[1].ref_value & 0xff; +      int tmask = depth_stencil->stencil[1].valuemask & 0xff; +      int wmask = depth_stencil->stencil[1].writemask & 0xff; + +      cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS | +                     BFO_ENABLE_STENCIL_FUNCS | +                     BFO_ENABLE_STENCIL_TWO_SIDE | +                     BFO_ENABLE_STENCIL_REF | +                     BFO_STENCIL_TWO_SIDE | +                     (ref  << BFO_STENCIL_REF_SHIFT) | +                     (test << BFO_STENCIL_TEST_SHIFT) | +                     (fop  << BFO_STENCIL_FAIL_SHIFT) | +                     (dfop << BFO_STENCIL_PASS_Z_FAIL_SHIFT) | +                     (dpop << BFO_STENCIL_PASS_Z_PASS_SHIFT)); + +      cso->bfo[1] = (_3DSTATE_BACKFACE_STENCIL_MASKS | +                     BFM_ENABLE_STENCIL_TEST_MASK | +                     BFM_ENABLE_STENCIL_WRITE_MASK | +                     (tmask << BFM_STENCIL_TEST_MASK_SHIFT) | +                     (wmask << BFM_STENCIL_WRITE_MASK_SHIFT)); +   } +   else { +      /* This actually disables two-side stencil: The bit set is a +       * modify-enable bit to indicate we are changing the two-side +       * setting.  Then there is a symbolic zero to show that we are +       * setting the flag to zero/off. 
+       */ +      cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS | +                     BFO_ENABLE_STENCIL_TWO_SIDE | +                     0); +      cso->bfo[1] = 0; +   } + +   if (depth_stencil->depth.enabled) { +      int func = i915_translate_compare_func(depth_stencil->depth.func); + +      cso->depth_LIS6 |= (S6_DEPTH_TEST_ENABLE | +                          (func << S6_DEPTH_TEST_FUNC_SHIFT)); + +      if (depth_stencil->depth.writemask) +	 cso->depth_LIS6 |= S6_DEPTH_WRITE_ENABLE; +   } + +   if (depth_stencil->alpha.enabled) { +      int test = i915_translate_compare_func(depth_stencil->alpha.func); +      ubyte refByte = float_to_ubyte(depth_stencil->alpha.ref_value); + +      cso->depth_LIS6 |= (S6_ALPHA_TEST_ENABLE | +			  (test << S6_ALPHA_TEST_FUNC_SHIFT) | +			  (((unsigned) refByte) << S6_ALPHA_REF_SHIFT)); +   } + +   return cso; +} + +static void i915_bind_depth_stencil_state(struct pipe_context *pipe, +                                          void *depth_stencil) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   i915->depth_stencil = (const struct i915_depth_stencil_state *)depth_stencil; + +   i915->dirty |= I915_NEW_DEPTH_STENCIL; +} + +static void i915_delete_depth_stencil_state(struct pipe_context *pipe, +                                            void *depth_stencil) +{ +   FREE(depth_stencil); +} + + +static void i915_set_scissor_state( struct pipe_context *pipe, +                                 const struct pipe_scissor_state *scissor ) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   memcpy( &i915->scissor, scissor, sizeof(*scissor) ); +   i915->dirty |= I915_NEW_SCISSOR; +} + + +static void i915_set_polygon_stipple( struct pipe_context *pipe, +                                   const struct pipe_poly_stipple *stipple ) +{ +} + + + +static void * +i915_create_fs_state(struct pipe_context *pipe, +                     const struct pipe_shader_state *templ) +{ +   struct i915_context *i915 = i915_context(pipe); +   struct i915_fragment_shader *ifs = CALLOC_STRUCT(i915_fragment_shader); +   if (!ifs) +      return NULL; + +   ifs->state.tokens = tgsi_dup_tokens(templ->tokens); + +   tgsi_scan_shader(templ->tokens, &ifs->info); + +   /* The shader's compiled to i915 instructions here */ +   i915_translate_fragment_program(i915, ifs); + +   return ifs; +} + +static void +i915_bind_fs_state(struct pipe_context *pipe, void *shader) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   i915->fs = (struct i915_fragment_shader*) shader; + +   i915->dirty |= I915_NEW_FS; +} + +static +void i915_delete_fs_state(struct pipe_context *pipe, void *shader) +{ +   struct i915_fragment_shader *ifs = (struct i915_fragment_shader *) shader; + +   if (ifs->program) +      FREE(ifs->program); +   ifs->program_len = 0; + +   FREE((struct tgsi_token *)ifs->state.tokens); + +   FREE(ifs); +} + + +static void * +i915_create_vs_state(struct pipe_context *pipe, +                     const struct pipe_shader_state *templ) +{ +   struct i915_context *i915 = i915_context(pipe); + +   /* just pass-through to draw module */ +   return draw_create_vertex_shader(i915->draw, templ); +} + +static void i915_bind_vs_state(struct pipe_context *pipe, void *shader) +{ +   struct i915_context *i915 = i915_context(pipe); + +   /* just pass-through to draw module */ +   draw_bind_vertex_shader(i915->draw, (struct draw_vertex_shader *) shader); + +   i915->dirty |= I915_NEW_VS; +} + +static 
void i915_delete_vs_state(struct pipe_context *pipe, void *shader) +{ +   struct i915_context *i915 = i915_context(pipe); + +   /* just pass-through to draw module */ +   draw_delete_vertex_shader(i915->draw, (struct draw_vertex_shader *) shader); +} + +static void i915_set_constant_buffer(struct pipe_context *pipe, +                                     uint shader, uint index, +                                     const struct pipe_constant_buffer *buf) +{ +   struct i915_context *i915 = i915_context(pipe); +   struct pipe_winsys *ws = pipe->winsys; +   draw_flush(i915->draw); + +   assert(shader < PIPE_SHADER_TYPES); +   assert(index == 0); + +   /* Make a copy of shader constants. +    * During fragment program translation we may add additional +    * constants to the array. +    * +    * We want to consider the situation where some user constants +    * (ex: a material color) may change frequently but the shader program +    * stays the same.  In that case we should only be updating the first +    * N constants, leaving any extras from shader translation alone. +    */ +   if (buf) { +      void *mapped; +      if (buf->buffer && buf->buffer->size && +          (mapped = ws->buffer_map(ws, buf->buffer, +                                    PIPE_BUFFER_USAGE_CPU_READ))) { +         memcpy(i915->current.constants[shader], mapped, buf->buffer->size); +         ws->buffer_unmap(ws, buf->buffer); +         i915->current.num_user_constants[shader] +            = buf->buffer->size / (4 * sizeof(float)); +      } +      else { +         i915->current.num_user_constants[shader] = 0; +      } +   } + +   i915->dirty |= I915_NEW_CONSTANTS; +} + + +static void i915_set_sampler_textures(struct pipe_context *pipe, +                                      unsigned num, +                                      struct pipe_texture **texture) +{ +   struct i915_context *i915 = i915_context(pipe); +   uint i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == i915->num_textures && +       !memcmp(i915->texture, texture, num * sizeof(struct pipe_texture *))) +      return; + +   /* Fixes wrong texture in texobj with VBUF */ +   draw_flush(i915->draw); + +   for (i = 0; i < num; i++) +      pipe_texture_reference((struct pipe_texture **) &i915->texture[i], +                             texture[i]); + +   for (i = num; i < i915->num_textures; i++) +      pipe_texture_reference((struct pipe_texture **) &i915->texture[i], +                             NULL); + +   i915->num_textures = num; + +   i915->dirty |= I915_NEW_TEXTURE; +} + + + +static void i915_set_framebuffer_state(struct pipe_context *pipe, +				       const struct pipe_framebuffer_state *fb) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   i915->framebuffer = *fb; /* struct copy */ + +   i915->dirty |= I915_NEW_FRAMEBUFFER; +} + + + +static void i915_set_clip_state( struct pipe_context *pipe, +			     const struct pipe_clip_state *clip ) +{ +   struct i915_context *i915 = i915_context(pipe); +   draw_flush(i915->draw); + +   draw_set_clip_state(i915->draw, clip); + +   i915->dirty |= I915_NEW_CLIP; +} + + + +/* Called when driver state tracker notices changes to the viewport + * matrix: + */ +static void i915_set_viewport_state( struct pipe_context *pipe, +				     const struct pipe_viewport_state *viewport ) +{ +   struct i915_context *i915 = i915_context(pipe); + +   i915->viewport = *viewport; /* struct copy */ + +   /* pass the viewport info to the draw module */ +   
draw_set_viewport_state(i915->draw, &i915->viewport); + +   i915->dirty |= I915_NEW_VIEWPORT; +} + + +static void * +i915_create_rasterizer_state(struct pipe_context *pipe, +                             const struct pipe_rasterizer_state *rasterizer) +{ +   struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state ); + +   cso->templ = rasterizer; +   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR; +   cso->light_twoside = rasterizer->light_twoside; +   cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE; +   cso->ds[1].f = rasterizer->offset_scale; +   if (rasterizer->poly_stipple_enable) { +      cso->st |= ST1_ENABLE; +   } + +   if (rasterizer->scissor) +      cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | ENABLE_SCISSOR_RECT; +   else +      cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT; + +   switch (rasterizer->cull_mode) { +   case PIPE_WINDING_NONE: +      cso->LIS4 |= S4_CULLMODE_NONE; +      break; +   case PIPE_WINDING_CW: +      cso->LIS4 |= S4_CULLMODE_CW; +      break; +   case PIPE_WINDING_CCW: +      cso->LIS4 |= S4_CULLMODE_CCW; +      break; +   case PIPE_WINDING_BOTH: +      cso->LIS4 |= S4_CULLMODE_BOTH; +      break; +   } + +   { +      int line_width = CLAMP((int)(rasterizer->line_width * 2), 1, 0xf); + +      cso->LIS4 |= line_width << S4_LINE_WIDTH_SHIFT; + +      if (rasterizer->line_smooth) +	 cso->LIS4 |= S4_LINE_ANTIALIAS_ENABLE; +   } + +   { +      int point_size = CLAMP((int) rasterizer->point_size, 1, 0xff); + +      cso->LIS4 |= point_size << S4_POINT_WIDTH_SHIFT; +   } + +   if (rasterizer->flatshade) { +      cso->LIS4 |= (S4_FLATSHADE_ALPHA | +                    S4_FLATSHADE_COLOR | +                    S4_FLATSHADE_SPECULAR); +   } + +   cso->LIS7 = fui( rasterizer->offset_units ); + + +   return cso; +} + +static void i915_bind_rasterizer_state( struct pipe_context *pipe, +                                        void *raster ) +{ +   struct i915_context *i915 = i915_context(pipe); + +   i915->rasterizer = (struct i915_rasterizer_state *)raster; + +   /* pass-through to draw module */ +   draw_set_rasterizer_state(i915->draw, +                          (i915->rasterizer ? i915->rasterizer->templ : NULL)); + +   i915->dirty |= I915_NEW_RASTERIZER; +} + +static void i915_delete_rasterizer_state(struct pipe_context *pipe, +                                         void *raster) +{ +   FREE(raster); +} + +static void i915_set_vertex_buffers(struct pipe_context *pipe, +                                    unsigned count, +                                    const struct pipe_vertex_buffer *buffers) +{ +   struct i915_context *i915 = i915_context(pipe); +   /* Because we change state before the draw_set_vertex_buffers call +    * we need a flush here, just to be sure. +    */ +   draw_flush(i915->draw); + +   memcpy(i915->vertex_buffer, buffers, count * sizeof(buffers[0])); +   i915->num_vertex_buffers = count; + +   /* pass-through to draw module */ +   draw_set_vertex_buffers(i915->draw, count, buffers); +} + +static void i915_set_vertex_elements(struct pipe_context *pipe, +                                     unsigned count, +                                     const struct pipe_vertex_element *elements) +{ +   struct i915_context *i915 = i915_context(pipe); +   /* Because we change state before the draw_set_vertex_buffers call +    * we need a flush here, just to be sure. 
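+    * (Same reasoning as in i915_set_vertex_buffers above: any primitives the
+    * draw module has queued must be rendered with the old vertex layout first.)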
+    */ +   draw_flush(i915->draw); + +   i915->num_vertex_elements = count; +   /* pass-through to draw module */ +   draw_set_vertex_elements(i915->draw, count, elements); +} + + +static void i915_set_edgeflags(struct pipe_context *pipe, +                               const unsigned *bitfield) +{ +   /* TODO do something here */ +} + +void +i915_init_state_functions( struct i915_context *i915 ) +{ +   i915->pipe.set_edgeflags = i915_set_edgeflags; +   i915->pipe.create_blend_state = i915_create_blend_state; +   i915->pipe.bind_blend_state = i915_bind_blend_state; +   i915->pipe.delete_blend_state = i915_delete_blend_state; + +   i915->pipe.create_sampler_state = i915_create_sampler_state; +   i915->pipe.bind_sampler_states = i915_bind_sampler_states; +   i915->pipe.delete_sampler_state = i915_delete_sampler_state; + +   i915->pipe.create_depth_stencil_alpha_state = i915_create_depth_stencil_state; +   i915->pipe.bind_depth_stencil_alpha_state = i915_bind_depth_stencil_state; +   i915->pipe.delete_depth_stencil_alpha_state = i915_delete_depth_stencil_state; + +   i915->pipe.create_rasterizer_state = i915_create_rasterizer_state; +   i915->pipe.bind_rasterizer_state = i915_bind_rasterizer_state; +   i915->pipe.delete_rasterizer_state = i915_delete_rasterizer_state; +   i915->pipe.create_fs_state = i915_create_fs_state; +   i915->pipe.bind_fs_state = i915_bind_fs_state; +   i915->pipe.delete_fs_state = i915_delete_fs_state; +   i915->pipe.create_vs_state = i915_create_vs_state; +   i915->pipe.bind_vs_state = i915_bind_vs_state; +   i915->pipe.delete_vs_state = i915_delete_vs_state; + +   i915->pipe.set_blend_color = i915_set_blend_color; +   i915->pipe.set_clip_state = i915_set_clip_state; +   i915->pipe.set_constant_buffer = i915_set_constant_buffer; +   i915->pipe.set_framebuffer_state = i915_set_framebuffer_state; + +   i915->pipe.set_polygon_stipple = i915_set_polygon_stipple; +   i915->pipe.set_scissor_state = i915_set_scissor_state; +   i915->pipe.set_sampler_textures = i915_set_sampler_textures; +   i915->pipe.set_viewport_state = i915_set_viewport_state; +   i915->pipe.set_vertex_buffers = i915_set_vertex_buffers; +   i915->pipe.set_vertex_elements = i915_set_vertex_elements; +} diff --git a/src/gallium/drivers/i915simple/i915_state.h b/src/gallium/drivers/i915simple/i915_state.h new file mode 100644 index 0000000000..86c6b0027d --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state.h @@ -0,0 +1,50 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef I915_STATE_H +#define I915_STATE_H + +struct i915_context; + + +struct i915_tracked_state { +   unsigned dirty; +   void (*update)( struct i915_context * ); +}; + +void i915_update_immediate( struct i915_context *i915 ); +void i915_update_dynamic( struct i915_context *i915 ); +void i915_update_derived( struct i915_context *i915 ); +void i915_update_samplers( struct i915_context *i915 ); +void i915_update_textures(struct i915_context *i915); + +void i915_emit_hardware_state( struct i915_context *i915 ); + +#endif diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c new file mode 100644 index 0000000000..178d4e8781 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_derived.c @@ -0,0 +1,183 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "i915_context.h" +#include "i915_state.h" +#include "i915_reg.h" +#include "i915_fpc.h" + + + +/** + * Determine the hardware vertex layout. + * Depends on vertex/fragment shader state. + */ +static void calculate_vertex_layout( struct i915_context *i915 ) +{ +   const struct i915_fragment_shader *fs = i915->fs; +   const enum interp_mode colorInterp = i915->rasterizer->color_interp; +   struct vertex_info vinfo; +   boolean texCoords[8], colors[2], fog, needW; +   uint i; +   int src; + +   memset(texCoords, 0, sizeof(texCoords)); +   colors[0] = colors[1] = fog = needW = FALSE; +   memset(&vinfo, 0, sizeof(vinfo)); + +   /* Determine which fragment program inputs are needed.  Setup HW vertex +    * layout below, in the HW-specific attribute order. 
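+    * (The order used below is: position, point size [still a TODO], primary and
+    * secondary color, fog, then texcoords.)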
+    */ +   for (i = 0; i < fs->info.num_inputs; i++) { +      switch (fs->info.input_semantic_name[i]) { +      case TGSI_SEMANTIC_POSITION: +         break; +      case TGSI_SEMANTIC_COLOR: +         assert(fs->info.input_semantic_index[i] < 2); +         colors[fs->info.input_semantic_index[i]] = TRUE; +         break; +      case TGSI_SEMANTIC_GENERIC: +         /* usually a texcoord */ +         { +            const uint unit = fs->info.input_semantic_index[i]; +            assert(unit < 8); +            texCoords[unit] = TRUE; +            needW = TRUE; +         } +         break; +      case TGSI_SEMANTIC_FOG: +         fog = TRUE; +         break; +      default: +         assert(0); +      } +   } + +    +   /* pos */ +   src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_POSITION, 0); +   if (needW) { +      draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src); +      vinfo.hwfmt[0] |= S4_VFMT_XYZW; +      vinfo.attrib[0].emit = EMIT_4F; +   } +   else { +      draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src); +      vinfo.hwfmt[0] |= S4_VFMT_XYZ; +      vinfo.attrib[0].emit = EMIT_3F; +   } + +   /* hardware point size */ +   /* XXX todo */ + +   /* primary color */ +   if (colors[0]) { +      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 0); +      draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src); +      vinfo.hwfmt[0] |= S4_VFMT_COLOR; +   } + +   /* secondary color */ +   if (colors[1]) { +      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 1); +      draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src); +      vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG; +   } + +   /* fog coord, not fog blend factor */ +   if (fog) { +      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_FOG, 0); +      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); +      vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM; +   } + +   /* texcoords */ +   for (i = 0; i < 8; i++) { +      uint hwtc; +      if (texCoords[i]) { +         hwtc = TEXCOORDFMT_4D; +         src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_GENERIC, i); +         draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +      } +      else { +         hwtc = TEXCOORDFMT_NOT_PRESENT; +      } +      vinfo.hwfmt[1] |= hwtc << (i * 4); +   } + +   draw_compute_vertex_size(&vinfo); + +   if (memcmp(&i915->current.vertex_info, &vinfo, sizeof(vinfo))) { +      /* Need to set this flag so that the LIS2/4 registers get set. +       * It also means the i915_update_immediate() function must be called +       * after this one, in i915_update_derived(). +       */ +      i915->dirty |= I915_NEW_VERTEX_FORMAT; + +      memcpy(&i915->current.vertex_info, &vinfo, sizeof(vinfo)); +   } +} + + + + +/* Hopefully this will remain quite simple, otherwise need to pull in + * something like the state tracker mechanism. + */ +void i915_update_derived( struct i915_context *i915 ) +{ +   if (i915->dirty & (I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS)) +      calculate_vertex_layout( i915 ); + +   if (i915->dirty & (I915_NEW_SAMPLER | I915_NEW_TEXTURE)) +      i915_update_samplers(i915); + +   if (i915->dirty & I915_NEW_TEXTURE) +      i915_update_textures(i915); + +   if (i915->dirty) +      i915_update_immediate( i915 ); + +   if (i915->dirty) +      i915_update_dynamic( i915 ); + +   if (i915->dirty & I915_NEW_FS) { +      i915->hardware_dirty |= I915_HW_PROGRAM; /* XXX right? 
*/ +   } + +   /* HW emit currently references framebuffer state directly: +    */ +   if (i915->dirty & I915_NEW_FRAMEBUFFER) +      i915->hardware_dirty |= I915_HW_STATIC; + +   i915->dirty = 0; +} diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915simple/i915_state_dynamic.c new file mode 100644 index 0000000000..86126a5a15 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_dynamic.c @@ -0,0 +1,310 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "i915_batch.h" +#include "i915_state_inlines.h" +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" + +#define FILE_DEBUG_FLAG DEBUG_STATE + +/* State that we have chosen to store in the DYNAMIC segment of the + * i915 indirect state mechanism.   + * + * Can't cache these in the way we do the static state, as there is no + * start/size in the command packet, instead an 'end' value that gets + * incremented. + * + * Additionally, there seems to be a requirement to re-issue the full + * (active) state every time a 4kb boundary is crossed. 
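+ * For now the dwords are simply cached in current.dynamic[] by
+ * set_dynamic_indirect() and re-emitted with the rest of the batch in
+ * i915_emit_hardware_state().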
+ */ + +static INLINE void set_dynamic_indirect( struct i915_context *i915, +					 unsigned offset, +					 const unsigned *src, +					 unsigned dwords ) +{ +   unsigned i; + +   for (i = 0; i < dwords; i++) +      i915->current.dynamic[offset + i] = src[i]; + +   i915->hardware_dirty |= I915_HW_DYNAMIC; +} + + +/*********************************************************************** + * Modes4: stencil masks and logicop  + */ +static void upload_MODES4( struct i915_context *i915 ) +{ +   unsigned modes4 = 0; + +   /* I915_NEW_STENCIL */ +   modes4 |= i915->depth_stencil->stencil_modes4; +   /* I915_NEW_BLEND */ +   modes4 |= i915->blend->modes4; + +   /* Always, so that we know when state is in-active:  +    */ +   set_dynamic_indirect( i915,  +			 I915_DYNAMIC_MODES4, +			 &modes4, +			 1 ); +} + +const struct i915_tracked_state i915_upload_MODES4 = { +   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL, +   upload_MODES4 +}; + + + + +/*********************************************************************** + */ + +static void upload_BFO( struct i915_context *i915 ) +{ +   set_dynamic_indirect( i915, +			 I915_DYNAMIC_BFO_0, +			 &(i915->depth_stencil->bfo[0]), +			 2 ); +} + +const struct i915_tracked_state i915_upload_BFO = { +   I915_NEW_DEPTH_STENCIL, +   upload_BFO +}; + + +/*********************************************************************** + */ + + +static void upload_BLENDCOLOR( struct i915_context *i915 ) +{ +   unsigned bc[2]; + +   memset( bc, 0, sizeof(bc) ); + +   /* I915_NEW_BLEND {_COLOR}  +    */ +   { +      const float *color = i915->blend_color.color; + +      bc[0] = _3DSTATE_CONST_BLEND_COLOR_CMD; +      bc[1] = pack_ui32_float4( color[0], +				color[1], +				color[2],  +				color[3] ); +   } + +   set_dynamic_indirect( i915,  +			 I915_DYNAMIC_BC_0, +			 bc, +			 2 ); +} + +const struct i915_tracked_state i915_upload_BLENDCOLOR = { +   I915_NEW_BLEND, +   upload_BLENDCOLOR +}; + +/*********************************************************************** + */ + + +static void upload_IAB( struct i915_context *i915 ) +{ +   unsigned iab = i915->blend->iab; + + +   set_dynamic_indirect( i915, +			 I915_DYNAMIC_IAB, +			 &iab, +			 1 ); +} + +const struct i915_tracked_state i915_upload_IAB = { +   I915_NEW_BLEND, +   upload_IAB +}; + + +/*********************************************************************** + */ + + + +static void upload_DEPTHSCALE( struct i915_context *i915 ) +{ +   set_dynamic_indirect( i915, +			 I915_DYNAMIC_DEPTHSCALE_0, +			 &(i915->rasterizer->ds[0].u), +			 2 ); +} + +const struct i915_tracked_state i915_upload_DEPTHSCALE = { +   I915_NEW_RASTERIZER, +   upload_DEPTHSCALE +}; + + + +/*********************************************************************** + * Polygon stipple + * + * The i915 supports a 4x4 stipple natively, GL wants 32x32. + * Fortunately stipple is usually a repeating pattern. + * + * XXX: does stipple pattern need to be adjusted according to + * the window position? + * + * XXX: possibly need workaround for conform paths test.  
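+ * upload_STIPPLE() below therefore only packs a 4x4 sub-block of the 32x32
+ * pattern into the ST1 dword.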
+ */ + +static void upload_STIPPLE( struct i915_context *i915 ) +{ +   unsigned st[2]; + +   st[0] = _3DSTATE_STIPPLE; +   st[1] = 0; + +   /* I915_NEW_RASTERIZER +    */ +   st[1] |= i915->rasterizer->st; + + +   /* I915_NEW_STIPPLE +    */ +   { +      const ubyte *mask = (const ubyte *)i915->poly_stipple.stipple; +      ubyte p[4]; + +      p[0] = mask[12] & 0xf; +      p[1] = mask[8] & 0xf; +      p[2] = mask[4] & 0xf; +      p[3] = mask[0] & 0xf; + +      /* Not sure what to do about fallbacks, so for now just dont: +       */ +      st[1] |= ((p[0] << 0) | +		(p[1] << 4) | +		(p[2] << 8) |  +		(p[3] << 12)); +   } + + +   set_dynamic_indirect( i915,  +			 I915_DYNAMIC_STP_0, +			 &st[0], +			 2 ); +} + + +const struct i915_tracked_state i915_upload_STIPPLE = { +   I915_NEW_RASTERIZER | I915_NEW_STIPPLE, +   upload_STIPPLE +}; + + + +/*********************************************************************** + * Scissor. + */ +static void upload_SCISSOR_ENABLE( struct i915_context *i915 ) +{ +   set_dynamic_indirect( i915, +			 I915_DYNAMIC_SC_ENA_0, +			 &(i915->rasterizer->sc[0]), +			 1 ); +} + +const struct i915_tracked_state i915_upload_SCISSOR_ENABLE = { +   I915_NEW_RASTERIZER, +   upload_SCISSOR_ENABLE +}; + + + +static void upload_SCISSOR_RECT( struct i915_context *i915 ) +{ +   unsigned x1 = i915->scissor.minx; +   unsigned y1 = i915->scissor.miny; +   unsigned x2 = i915->scissor.maxx; +   unsigned y2 = i915->scissor.maxy; +   unsigned sc[3]; +  +   sc[0] = _3DSTATE_SCISSOR_RECT_0_CMD; +   sc[1] = (y1 << 16) | (x1 & 0xffff); +   sc[2] = (y2 << 16) | (x2 & 0xffff); + +   set_dynamic_indirect( i915,  +			 I915_DYNAMIC_SC_RECT_0, +			 &sc[0], +			 3 ); +} + + +const struct i915_tracked_state i915_upload_SCISSOR_RECT = { +   I915_NEW_SCISSOR, +   upload_SCISSOR_RECT +}; + + + + + + +static const struct i915_tracked_state *atoms[] = { +   &i915_upload_MODES4, +   &i915_upload_BFO, +   &i915_upload_BLENDCOLOR, +   &i915_upload_IAB, +   &i915_upload_DEPTHSCALE, +   &i915_upload_STIPPLE, +   &i915_upload_SCISSOR_ENABLE, +   &i915_upload_SCISSOR_RECT +}; + +/* These will be dynamic indirect state commands, but for now just end + * up on the batch buffer with everything else. + */ +void i915_update_dynamic( struct i915_context *i915 ) +{ +   int i; + +   for (i = 0; i < Elements(atoms); i++) +      if (i915->dirty & atoms[i]->dirty) +	 atoms[i]->update( i915 ); +} + diff --git a/src/gallium/drivers/i915simple/i915_state_emit.c b/src/gallium/drivers/i915simple/i915_state_emit.c new file mode 100644 index 0000000000..6558cf1c3e --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_emit.c @@ -0,0 +1,410 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_batch.h" +#include "i915_reg.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" + +static unsigned translate_format( enum pipe_format format ) +{ +   switch (format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +      return COLOR_BUF_ARGB8888; +   case PIPE_FORMAT_R5G6B5_UNORM: +      return COLOR_BUF_RGB565; +   default: +      assert(0); +      return 0; +   } +} + +static unsigned translate_depth_format( enum pipe_format zformat ) +{ +   switch (zformat) { +   case PIPE_FORMAT_S8Z24_UNORM: +      return DEPTH_FRMT_24_FIXED_8_OTHER; +   case PIPE_FORMAT_Z16_UNORM: +      return DEPTH_FRMT_16_FIXED; +   default: +      assert(0); +      return 0; +   } +} + + +/** + * Examine framebuffer state to determine width, height. + */ +static boolean +framebuffer_size(const struct pipe_framebuffer_state *fb, +                 uint *width, uint *height) +{ +   if (fb->cbufs[0]) { +      *width = fb->cbufs[0]->width; +      *height = fb->cbufs[0]->height; +      return TRUE; +   } +   else if (fb->zsbuf) { +      *width = fb->zsbuf->width; +      *height = fb->zsbuf->height; +      return TRUE; +   } +   else { +      *width = *height = 0; +      return FALSE; +   } +} + + +/* Push the state into the sarea and/or texture memory. 
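+ * Each dirty-flag block below notes its worst-case dword/reloc count; those
+ * are the figures the BEGIN_BATCH() estimate at the top of the function adds up.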
+ */ +void +i915_emit_hardware_state(struct i915_context *i915 ) +{ +   /* XXX: there must be an easier way */ +   const unsigned dwords = ( 14 +  +                             7 +  +                             I915_MAX_DYNAMIC +  +                             8 +  +                             2 + I915_TEX_UNITS*3 +  +                             2 + I915_TEX_UNITS*3 + +                             2 + I915_MAX_CONSTANT*4 +  +#if 0 +                             i915->current.program_len +  +#else +                             i915->fs->program_len +  +#endif +                             6  +                           ) * 3/2; /* plus 50% margin */ +   const unsigned relocs = ( I915_TEX_UNITS + +	                     3 +                           ) * 3/2; /* plus 50% margin */ + +#if 0 +   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs); +#endif +    +   if(!BEGIN_BATCH(dwords, relocs)) { +      FLUSH_BATCH(NULL); +      assert(BEGIN_BATCH(dwords, relocs)); +   } + +   /* 14 dwords, 0 relocs */ +   if (i915->hardware_dirty & I915_HW_INVARIENT) +   { +      OUT_BATCH(_3DSTATE_AA_CMD | +		AA_LINE_ECAAR_WIDTH_ENABLE | +		AA_LINE_ECAAR_WIDTH_1_0 | +		AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0); + +      OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD); +      OUT_BATCH(0); + +      OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD); +      OUT_BATCH(0); +       +      OUT_BATCH(_3DSTATE_DFLT_Z_CMD); +      OUT_BATCH(0); + +      OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS | +		CSB_TCB(0, 0) | +		CSB_TCB(1, 1) | +		CSB_TCB(2, 2) | +		CSB_TCB(3, 3) | +		CSB_TCB(4, 4) |  +		CSB_TCB(5, 5) |  +		CSB_TCB(6, 6) |  +		CSB_TCB(7, 7)); + +      OUT_BATCH(_3DSTATE_RASTER_RULES_CMD | +		ENABLE_POINT_RASTER_RULE | +		OGL_POINT_RASTER_RULE | +		ENABLE_LINE_STRIP_PROVOKE_VRTX | +		ENABLE_TRI_FAN_PROVOKE_VRTX | +		LINE_STRIP_PROVOKE_VRTX(1) | +		TRI_FAN_PROVOKE_VRTX(2) |  +		ENABLE_TEXKILL_3D_4D |  +		TEXKILL_4D); + +      /* Need to initialize this to zero. 
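+       * (S3 is not among the registers reloaded by the immediate-state block
+       * further down, so give it a defined value here.)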
+       */ +      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0)); +      OUT_BATCH(0); + +      OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE); + +      /* disable indirect state for now +       */ +      OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0); +      OUT_BATCH(0); +   } +    +   /* 7 dwords, 1 relocs */ +   if (i915->hardware_dirty & I915_HW_IMMEDIATE) +   { +      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |  +		I1_LOAD_S(0) | +		I1_LOAD_S(1) | +		I1_LOAD_S(2) | +		I1_LOAD_S(4) | +		I1_LOAD_S(5) | +		I1_LOAD_S(6) |  +		(5)); +       +      if(i915->vbo) +         OUT_RELOC(i915->vbo, +                   I915_BUFFER_ACCESS_READ, +                   i915->current.immediate[I915_IMMEDIATE_S0]); +      else +	 /* FIXME: we should not do this */ +	 OUT_BATCH(0); +      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S1]); +      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S2]); +      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S4]); +      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S5]); +      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S6]); +   }  +    +   /* I915_MAX_DYNAMIC dwords, 0 relocs */ +   if (i915->hardware_dirty & I915_HW_DYNAMIC)  +   { +      int i; +      for (i = 0; i < I915_MAX_DYNAMIC; i++) { +	 OUT_BATCH(i915->current.dynamic[i]); +      } +   } +    +   /* 8 dwords, 2 relocs */ +   if (i915->hardware_dirty & I915_HW_STATIC) +   { +      struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0]; +      struct pipe_surface *depth_surface = i915->framebuffer.zsbuf; + +      if (cbuf_surface) { +	 unsigned cpitch = cbuf_surface->stride; +	 unsigned ctile = BUF_3D_USE_FENCE; +         struct i915_texture *tex = (struct i915_texture *) +                                    cbuf_surface->texture; +         struct pipe_buffer *buffer = tex->buffer; +         assert(tex); + +	 if (tex && tex->tiled) { +	    ctile = BUF_3D_TILED_SURFACE; +	 } + +	 OUT_BATCH(_3DSTATE_BUF_INFO_CMD); + +	 OUT_BATCH(BUF_3D_ID_COLOR_BACK | +		   BUF_3D_PITCH(cpitch) |  /* pitch in bytes */ +		   ctile); + +	 OUT_RELOC(tex->buffer, +		   I915_BUFFER_ACCESS_WRITE, +		   cbuf_surface->offset); +      } + +      /* What happens if no zbuf?? 
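+       * For now the depth BUF_INFO packet is simply skipped and zformat
+       * stays zero in DST_BUF_VARS below.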
+       */ +      if (depth_surface) { +	 unsigned zpitch = depth_surface->stride; +	 unsigned ztile = BUF_3D_USE_FENCE; +         struct i915_texture *tex = (struct i915_texture *) +                                    depth_surface->texture; +         struct pipe_buffer *buffer = tex->buffer; +         assert(tex); + +	 if (tex && tex->tiled) { +	    ztile = BUF_3D_TILED_SURFACE; +	 } + +	 OUT_BATCH(_3DSTATE_BUF_INFO_CMD); + +	 OUT_BATCH(BUF_3D_ID_DEPTH | +		   BUF_3D_PITCH(zpitch) |  /* pitch in bytes */ +		   ztile); + +	 OUT_RELOC(tex->buffer, +		   I915_BUFFER_ACCESS_WRITE, +		   depth_surface->offset); +      } +    +      { +	 unsigned cformat, zformat = 0; +       +	 if (cbuf_surface) +            cformat = cbuf_surface->format; +         else +            cformat = PIPE_FORMAT_A8R8G8B8_UNORM; /* arbitrary */ +         cformat = translate_format(cformat); + +	 if (depth_surface)  +	    zformat = translate_depth_format( i915->framebuffer.zsbuf->format ); + +	 OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD); +	 OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */ +		   DSTORG_VERT_BIAS(0x8) | /* .5 */ +		   LOD_PRECLAMP_OGL | +		   TEX_DEFAULT_COLOR_OGL | +		   cformat | +		   zformat ); +      } +   } + +#if 01 +      /* texture images */ +      /* 2 + I915_TEX_UNITS*3 dwords, I915_TEX_UNITS relocs */ +      if (i915->hardware_dirty & (I915_HW_MAP | I915_HW_SAMPLER)) +      { +         const uint nr = i915->current.sampler_enable_nr; +         if (nr) { +            const uint enabled = i915->current.sampler_enable_flags; +            uint unit; +            uint count = 0; +            OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr)); +            OUT_BATCH(enabled); +            for (unit = 0; unit < I915_TEX_UNITS; unit++) { +               if (enabled & (1 << unit)) { +                  struct pipe_buffer *buf = +                     i915->texture[unit]->buffer; +                  uint offset = 0; +                  assert(buf); + +                  count++; + +                  OUT_RELOC(buf, +                            I915_BUFFER_ACCESS_READ, +                            offset); +                  OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */ +                  OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */ +               } +            } +            assert(count == nr); +         } +      } +#endif + +#if 01 +   /* samplers */ +   /* 2 + I915_TEX_UNITS*3 dwords, 0 relocs */ +   if (i915->hardware_dirty & I915_HW_SAMPLER)  +   { +      if (i915->current.sampler_enable_nr) { +	 int i; +	  +	 OUT_BATCH( _3DSTATE_SAMPLER_STATE |  +		    (3 * i915->current.sampler_enable_nr) ); + +	 OUT_BATCH( i915->current.sampler_enable_flags ); + +	 for (i = 0; i < I915_TEX_UNITS; i++) { +	    if (i915->current.sampler_enable_flags & (1<<i)) { +	       OUT_BATCH( i915->current.sampler[i][0] ); +	       OUT_BATCH( i915->current.sampler[i][1] ); +	       OUT_BATCH( i915->current.sampler[i][2] ); +	    } +	 } +      } +   } +#endif + +   /* constants */ +   /* 2 + I915_MAX_CONSTANT*4 dwords, 0 relocs */ +   if (i915->hardware_dirty & I915_HW_PROGRAM) +   { +      /* Collate the user-defined constants with the fragment shader's +       * immediates according to the constant_flags[] array. 
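+       * Entries flagged I915_CONSTFLAG_USER are taken from current.constants[];
+       * all others use the immediate values produced at shader translation time.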
+       */ +      const uint nr = i915->fs->num_constants; +      if (nr) { +         uint i; + +         OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) ); +         OUT_BATCH( (1 << (nr - 1)) | ((1 << (nr - 1)) - 1) ); + +         for (i = 0; i < nr; i++) { +            const uint *c; +            if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) { +               /* grab user-defined constant */ +               c = (uint *) i915->current.constants[PIPE_SHADER_FRAGMENT][i]; +            } +            else { +               /* emit program constant */ +               c = (uint *) i915->fs->constants[i]; +            } +#if 0 /* debug */ +            { +               float *f = (float *) c; +               printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3], +                      (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER +                       ? "user" : "immediate")); +            } +#endif +            OUT_BATCH(*c++); +            OUT_BATCH(*c++); +            OUT_BATCH(*c++); +            OUT_BATCH(*c++); +         } +      } +   } + +   /* Fragment program */ +   /* i915->current.program_len dwords, 0 relocs */ +   if (i915->hardware_dirty & I915_HW_PROGRAM) +   { +      uint i; +      /* we should always have, at least, a pass-through program */ +      assert(i915->fs->program_len > 0); +      for (i = 0; i < i915->fs->program_len; i++) { +         OUT_BATCH(i915->fs->program[i]); +      } +   } + +   /* drawing surface size */ +   /* 6 dwords, 0 relocs */ +   { +      uint w, h; +      boolean k = framebuffer_size(&i915->framebuffer, &w, &h); +      (void)k; +      assert(k); + +      OUT_BATCH(_3DSTATE_DRAW_RECT_CMD); +      OUT_BATCH(0); +      OUT_BATCH(0); +      OUT_BATCH(((w - 1) & 0xffff) | ((h - 1) << 16)); +      OUT_BATCH(0); +      OUT_BATCH(0); +   } + + +   i915->hardware_dirty = 0; +} diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915simple/i915_state_immediate.c new file mode 100644 index 0000000000..8c16bb4e27 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_immediate.c @@ -0,0 +1,225 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +  +#include "i915_state_inlines.h" +#include "i915_context.h" +#include "i915_state.h" +#include "i915_reg.h" +#include "util/u_memory.h" + + +/* All state expressable with the LOAD_STATE_IMMEDIATE_1 packet. + * Would like to opportunistically recombine all these fragments into + * a single packet containing only what has changed, but for now emit + * as multiple packets. + */ + + + + +/*********************************************************************** + * S0,S1: Vertex buffer state.   + */ +static void upload_S0S1(struct i915_context *i915) +{ +   unsigned LIS0, LIS1; + +   /* INTEL_NEW_VBO */ +   /* TODO: re-use vertex buffers here? */ +   LIS0 = i915->vbo_offset; + +   /* INTEL_NEW_VERTEX_SIZE -- do this where the vertex size is calculated!  +    */ +   { +      unsigned vertex_size = i915->current.vertex_info.size; + +      LIS1 = ((vertex_size << 24) | +	      (vertex_size << 16)); +   } + +   /* INTEL_NEW_VBO */ +   /* TODO: use a vertex generation number to track vbo changes */ +   if (1 || +       i915->current.immediate[I915_IMMEDIATE_S0] != LIS0 || +       i915->current.immediate[I915_IMMEDIATE_S1] != LIS1)  +   { +      i915->current.immediate[I915_IMMEDIATE_S0] = LIS0; +      i915->current.immediate[I915_IMMEDIATE_S1] = LIS1; +      i915->hardware_dirty |= I915_HW_IMMEDIATE; +   } +} + +const struct i915_tracked_state i915_upload_S0S1 = { +   I915_NEW_VBO | I915_NEW_VERTEX_FORMAT, +   upload_S0S1 +}; + + + + +/*********************************************************************** + * S4: Vertex format, rasterization state + */ +static void upload_S2S4(struct i915_context *i915) +{ +   unsigned LIS2, LIS4; + +   /* I915_NEW_VERTEX_FORMAT */ +   { +      LIS2 = i915->current.vertex_info.hwfmt[1]; +      LIS4 = i915->current.vertex_info.hwfmt[0]; +      /* +      debug_printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4); +      */ +      assert(LIS4); /* should never be zero? 
*/ +   } + +   LIS4 |= i915->rasterizer->LIS4; + +   if (LIS2 != i915->current.immediate[I915_IMMEDIATE_S2] || +       LIS4 != i915->current.immediate[I915_IMMEDIATE_S4]) { + +      i915->current.immediate[I915_IMMEDIATE_S2] = LIS2; +      i915->current.immediate[I915_IMMEDIATE_S4] = LIS4; +      i915->hardware_dirty |= I915_HW_IMMEDIATE; +   } +} + + +const struct i915_tracked_state i915_upload_S2S4 = { +   I915_NEW_RASTERIZER | I915_NEW_VERTEX_FORMAT, +   upload_S2S4 +}; + + + +/*********************************************************************** + *  + */ +static void upload_S5( struct i915_context *i915 ) +{ +   unsigned LIS5 = 0; + +   LIS5 |= i915->depth_stencil->stencil_LIS5; + +   LIS5 |= i915->blend->LIS5; + +#if 0 +   /* I915_NEW_RASTERIZER */ +   if (i915->state.Polygon->OffsetFill) { +      LIS5 |= S5_GLOBAL_DEPTH_OFFSET_ENABLE; +   } +#endif + + +   if (LIS5 != i915->current.immediate[I915_IMMEDIATE_S5]) { +      i915->current.immediate[I915_IMMEDIATE_S5] = LIS5; +      i915->hardware_dirty |= I915_HW_IMMEDIATE; +   } +} + +const struct i915_tracked_state i915_upload_S5 = { +   (I915_NEW_DEPTH_STENCIL | I915_NEW_BLEND | I915_NEW_RASTERIZER), +   upload_S5 +}; + + +/*********************************************************************** + */ +static void upload_S6( struct i915_context *i915 ) +{ +   unsigned LIS6 = (2 << S6_TRISTRIP_PV_SHIFT); + +   /* I915_NEW_FRAMEBUFFER +    */ +   if (i915->framebuffer.cbufs[0]) +      LIS6 |= S6_COLOR_WRITE_ENABLE; + +   /* I915_NEW_BLEND +    */ +   LIS6 |= i915->blend->LIS6; + +   /* I915_NEW_DEPTH +    */ +   LIS6 |= i915->depth_stencil->depth_LIS6; + +   if (LIS6 != i915->current.immediate[I915_IMMEDIATE_S6]) { +      i915->current.immediate[I915_IMMEDIATE_S6] = LIS6; +      i915->hardware_dirty |= I915_HW_IMMEDIATE; +   } +} + +const struct i915_tracked_state i915_upload_S6 = { +   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL | I915_NEW_FRAMEBUFFER, +   upload_S6 +}; + + +/*********************************************************************** + */ +static void upload_S7( struct i915_context *i915 ) +{ +   unsigned LIS7; + +   /* I915_NEW_RASTERIZER +    */ +   LIS7 = i915->rasterizer->LIS7; + +   if (LIS7 != i915->current.immediate[I915_IMMEDIATE_S7]) { +      i915->current.immediate[I915_IMMEDIATE_S7] = LIS7; +      i915->hardware_dirty |= I915_HW_IMMEDIATE; +   } +} + +const struct i915_tracked_state i915_upload_S7 = { +   I915_NEW_RASTERIZER, +   upload_S7 +}; + + +static const struct i915_tracked_state *atoms[] = { +   &i915_upload_S0S1, +   &i915_upload_S2S4, +   &i915_upload_S5, +   &i915_upload_S6, +   &i915_upload_S7 +}; + +/*  + */ +void i915_update_immediate( struct i915_context *i915 ) +{ +   int i; + +   for (i = 0; i < Elements(atoms); i++) +      if (i915->dirty & atoms[i]->dirty) +	 atoms[i]->update( i915 ); +} diff --git a/src/gallium/drivers/i915simple/i915_state_inlines.h b/src/gallium/drivers/i915simple/i915_state_inlines.h new file mode 100644 index 0000000000..378de8f9c4 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_inlines.h @@ -0,0 +1,230 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef I915_STATE_INLINES_H +#define I915_STATE_INLINES_H + +#include "pipe/p_compiler.h" +#include "pipe/p_defines.h" +#include "i915_reg.h" + + +static INLINE unsigned +i915_translate_compare_func(unsigned func) +{ +   switch (func) { +   case PIPE_FUNC_NEVER: +      return COMPAREFUNC_NEVER; +   case PIPE_FUNC_LESS: +      return COMPAREFUNC_LESS; +   case PIPE_FUNC_LEQUAL: +      return COMPAREFUNC_LEQUAL; +   case PIPE_FUNC_GREATER: +      return COMPAREFUNC_GREATER; +   case PIPE_FUNC_GEQUAL: +      return COMPAREFUNC_GEQUAL; +   case PIPE_FUNC_NOTEQUAL: +      return COMPAREFUNC_NOTEQUAL; +   case PIPE_FUNC_EQUAL: +      return COMPAREFUNC_EQUAL; +   case PIPE_FUNC_ALWAYS: +      return COMPAREFUNC_ALWAYS; +   default: +      return COMPAREFUNC_ALWAYS; +   } +} + +static INLINE unsigned +i915_translate_stencil_op(unsigned op) +{ +   switch (op) { +   case PIPE_STENCIL_OP_KEEP: +      return STENCILOP_KEEP; +   case PIPE_STENCIL_OP_ZERO: +      return STENCILOP_ZERO; +   case PIPE_STENCIL_OP_REPLACE: +      return STENCILOP_REPLACE; +   case PIPE_STENCIL_OP_INCR: +      return STENCILOP_INCRSAT; +   case PIPE_STENCIL_OP_DECR: +      return STENCILOP_DECRSAT; +   case PIPE_STENCIL_OP_INCR_WRAP: +      return STENCILOP_INCR; +   case PIPE_STENCIL_OP_DECR_WRAP: +      return STENCILOP_DECR; +   case PIPE_STENCIL_OP_INVERT: +      return STENCILOP_INVERT; +   default: +      return STENCILOP_ZERO; +   } +} + +static INLINE unsigned +i915_translate_blend_factor(unsigned factor) +{ +   switch (factor) { +   case PIPE_BLENDFACTOR_ZERO: +      return BLENDFACT_ZERO; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      return BLENDFACT_SRC_ALPHA; +   case PIPE_BLENDFACTOR_ONE: +      return BLENDFACT_ONE; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      return BLENDFACT_SRC_COLR; +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      return BLENDFACT_INV_SRC_COLR; +   case PIPE_BLENDFACTOR_DST_COLOR: +      return BLENDFACT_DST_COLR; +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      return BLENDFACT_INV_DST_COLR; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      return BLENDFACT_INV_SRC_ALPHA; +   case PIPE_BLENDFACTOR_DST_ALPHA: +      return BLENDFACT_DST_ALPHA; +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      return BLENDFACT_INV_DST_ALPHA; +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +      return 
BLENDFACT_SRC_ALPHA_SATURATE; +   case PIPE_BLENDFACTOR_CONST_COLOR: +      return BLENDFACT_CONST_COLOR; +   case PIPE_BLENDFACTOR_INV_CONST_COLOR: +      return BLENDFACT_INV_CONST_COLOR; +   case PIPE_BLENDFACTOR_CONST_ALPHA: +      return BLENDFACT_CONST_ALPHA; +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +      return BLENDFACT_INV_CONST_ALPHA; +   default: +      return BLENDFACT_ZERO; +   } +} + +static INLINE unsigned +i915_translate_blend_func(unsigned mode) +{ +   switch (mode) { +   case PIPE_BLEND_ADD: +      return BLENDFUNC_ADD; +   case PIPE_BLEND_MIN: +      return BLENDFUNC_MIN; +   case PIPE_BLEND_MAX: +      return BLENDFUNC_MAX; +   case PIPE_BLEND_SUBTRACT: +      return BLENDFUNC_SUBTRACT; +   case PIPE_BLEND_REVERSE_SUBTRACT: +      return BLENDFUNC_REVERSE_SUBTRACT; +   default: +      return 0; +   } +} + + +static INLINE unsigned +i915_translate_logic_op(unsigned opcode) +{ +   switch (opcode) { +   case PIPE_LOGICOP_CLEAR: +      return LOGICOP_CLEAR; +   case PIPE_LOGICOP_AND: +      return LOGICOP_AND; +   case PIPE_LOGICOP_AND_REVERSE: +      return LOGICOP_AND_RVRSE; +   case PIPE_LOGICOP_COPY: +      return LOGICOP_COPY; +   case PIPE_LOGICOP_COPY_INVERTED: +      return LOGICOP_COPY_INV; +   case PIPE_LOGICOP_AND_INVERTED: +      return LOGICOP_AND_INV; +   case PIPE_LOGICOP_NOOP: +      return LOGICOP_NOOP; +   case PIPE_LOGICOP_XOR: +      return LOGICOP_XOR; +   case PIPE_LOGICOP_OR: +      return LOGICOP_OR; +   case PIPE_LOGICOP_OR_INVERTED: +      return LOGICOP_OR_INV; +   case PIPE_LOGICOP_NOR: +      return LOGICOP_NOR; +   case PIPE_LOGICOP_EQUIV: +      return LOGICOP_EQUIV; +   case PIPE_LOGICOP_INVERT: +      return LOGICOP_INV; +   case PIPE_LOGICOP_OR_REVERSE: +      return LOGICOP_OR_RVRSE; +   case PIPE_LOGICOP_NAND: +      return LOGICOP_NAND; +   case PIPE_LOGICOP_SET: +      return LOGICOP_SET; +   default: +      return LOGICOP_SET; +   } +} + + + +static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) +{ +   boolean ok; + +   switch (hw_prim) { +   case PRIM3D_POINTLIST: +      ok = (nr >= 1); +      assert(ok); +      break; +   case PRIM3D_LINELIST: +      ok = (nr >= 2) && (nr % 2) == 0; +      assert(ok); +      break; +   case PRIM3D_LINESTRIP: +      ok = (nr >= 2); +      assert(ok); +      break; +   case PRIM3D_TRILIST: +      ok = (nr >= 3) && (nr % 3) == 0; +      assert(ok); +      break; +   case PRIM3D_TRISTRIP: +      ok = (nr >= 3); +      assert(ok); +      break; +   case PRIM3D_TRIFAN: +      ok = (nr >= 3); +      assert(ok); +      break; +   case PRIM3D_POLY: +      ok = (nr >= 3); +      assert(ok); +      break; +   default: +      assert(0); +      ok = 0; +      break; +   } + +   return ok; +} + +#endif diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c new file mode 100644 index 0000000000..c09c10601b --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_sampler.c @@ -0,0 +1,299 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+
+#include "i915_state_inlines.h"
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_state.h"
+
+
+/*
+ * A note about min_lod & max_lod.
+ *
+ * There is a circular dependency between the sampler state
+ * and the map state to be submitted to hw.
+ *
+ * Two conditions must be met:
+ * min_lod <= max_lod == true
+ * max_lod <= last_level == true
+ *
+ *
+ * This would all be fine and dandy if it weren't for the fact that max_lod
+ * is set on the map state instead of the sampler state.  That is, the
+ * max_lod we submit on the map is:
+ * max_lod = MIN2(last_level, max_lod);
+ *
+ * So we need to update the map state when we change samplers, and we
+ * need to change the sampler state when the map state is changed.
+ * The first part is done by calling i915_update_texture in
+ * i915_update_samplers, and the second part is done elsewhere in the
+ * code tracking the state changes.
+ */
+
+static void
+i915_update_texture(struct i915_context *i915,
+                    uint unit,
+                    const struct i915_texture *tex,
+                    const struct i915_sampler_state *sampler,
+                    uint state[6]);
+/**
+ * Compute i915 texture sampling state.
+ *
+ * Recalculate all state from scratch.  Perhaps not the most
+ * efficient, but this has gotten complex enough that we need
+ * something which is understandable and reliable.
+ * \param state  returns the 3 words of computed state
+ */
+static void update_sampler(struct i915_context *i915,
+                           uint unit,
+                           const struct i915_sampler_state *sampler,
+                           const struct i915_texture *tex,
+                           unsigned state[3] )
+{
+   const struct pipe_texture *pt = &tex->base;
+   unsigned minlod, lastlod;
+
+   /* Need to do this after updating the maps, which call the
+    * intel_finalize_mipmap_tree and hence can update firstLevel:
+    */
+   state[0] = sampler->state[0];
+   state[1] = sampler->state[1];
+   state[2] = sampler->state[2];
+
+   if (pt->format == PIPE_FORMAT_YCBCR ||
+       pt->format == PIPE_FORMAT_YCBCR_REV)
+      state[0] |= SS2_COLORSPACE_CONVERSION;
+
+   /* 3D textures don't seem to respect the border color.
+    * Fallback if there's ever a danger that they might refer to
+    * it.
+    *
+    * Effectively this means fallback on 3D clamp or
+    * clamp_to_border.
+    *
+    * XXX: Check if this is true on i945.
+    * XXX: Check if this bug got fixed in release silicon.
+    */
+#if 0
+   {
+      const unsigned ws = sampler->templ->wrap_s;
+      const unsigned wt = sampler->templ->wrap_t;
+      const unsigned wr = sampler->templ->wrap_r;
+      if (pt->target == PIPE_TEXTURE_3D &&
+          (sampler->templ->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
+           sampler->templ->mag_img_filter != PIPE_TEX_FILTER_NEAREST) &&
+          (ws == PIPE_TEX_WRAP_CLAMP ||
+           wt == PIPE_TEX_WRAP_CLAMP ||
+           wr == PIPE_TEX_WRAP_CLAMP ||
+           ws == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
+           wt == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
+           wr == PIPE_TEX_WRAP_CLAMP_TO_BORDER)) {
+         if (i915->strict_conformance) {
+            assert(0);
+            /* 	    sampler->fallback = true; */
+            /* TODO */
+         }
+      }
+   }
+#endif
+
+   /* See note at the top of file */
+   minlod = sampler->minlod;
+   lastlod = pt->last_level << 4;
+
+   if (lastlod < minlod) {
+      minlod = lastlod;
+   }
+
+   state[1] |= (minlod << SS3_MIN_LOD_SHIFT);   /* use the clamped value */
+   state[1] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
+}
+
+
+void i915_update_samplers( struct i915_context *i915 )
+{
+   uint unit;
+
+   i915->current.sampler_enable_nr = 0;
+   i915->current.sampler_enable_flags = 0x0;
+
+   for (unit = 0; unit < i915->num_textures && unit < i915->num_samplers;
+        unit++) {
+      /* determine unit enable/disable by looking for a bound texture */
+      /* could also examine the fragment program? */
+      if (i915->texture[unit]) {
+	 update_sampler( i915,
+	                 unit,
+	                 i915->sampler[unit],       /* sampler state */
+	                 i915->texture[unit],        /* texture */
+	                 i915->current.sampler[unit] /* the result */
+	                 );
+	 i915_update_texture( i915,
+	                      unit,
+	                      i915->texture[unit],          /* texture */
+	                      i915->sampler[unit],          /* sampler state */
+	                      i915->current.texbuffer[unit] );
+
+	 i915->current.sampler_enable_nr++;
+	 i915->current.sampler_enable_flags |= (1 << unit);
+      }
+   }
+
+   i915->hardware_dirty |= I915_HW_SAMPLER | I915_HW_MAP;
+}
+
+
+static uint
+translate_texture_format(enum pipe_format pipeFormat)
+{
+   switch (pipeFormat) {
+   case PIPE_FORMAT_L8_UNORM:
+      return MAPSURF_8BIT | MT_8BIT_L8;
+   case PIPE_FORMAT_I8_UNORM:
+      return MAPSURF_8BIT | MT_8BIT_I8;
+   case PIPE_FORMAT_A8_UNORM:
+      return MAPSURF_8BIT | MT_8BIT_A8;
+   case PIPE_FORMAT_A8L8_UNORM:
+      return MAPSURF_16BIT | MT_16BIT_AY88;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      return MAPSURF_16BIT | MT_16BIT_RGB565;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      return MAPSURF_16BIT | MT_16BIT_ARGB1555;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      return MAPSURF_16BIT | MT_16BIT_ARGB4444;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+   case PIPE_FORMAT_YCBCR_REV:
+      return (MAPSURF_422 | MT_422_YCRCB_NORMAL);
+   case PIPE_FORMAT_YCBCR:
+      return (MAPSURF_422 | MT_422_YCRCB_SWAPY);
+#if 0
+   case PIPE_FORMAT_RGB_FXT1:
+   case PIPE_FORMAT_RGBA_FXT1:
+      return (MAPSURF_COMPRESSED | MT_COMPRESS_FXT1);
+#endif
+   case PIPE_FORMAT_Z16_UNORM:
+      return (MAPSURF_16BIT | MT_16BIT_L16);
+#if 0
+   case 
PIPE_FORMAT_RGBA_DXT1: +   case PIPE_FORMAT_RGB_DXT1: +      return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT1); +   case PIPE_FORMAT_RGBA_DXT3: +      return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT2_3); +   case PIPE_FORMAT_RGBA_DXT5: +      return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT4_5); +#endif +   case PIPE_FORMAT_S8Z24_UNORM: +      return (MAPSURF_32BIT | MT_32BIT_xI824); +   default: +      debug_printf("i915: translate_texture_format() bad image format %x\n", +              pipeFormat); +      assert(0); +      return 0; +   } +} + + +static void +i915_update_texture(struct i915_context *i915, +                    uint unit, +                    const struct i915_texture *tex, +                    const struct i915_sampler_state *sampler, +                    uint state[6]) +{ +   const struct pipe_texture *pt = &tex->base; +   uint format, pitch; +   const uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; +   const uint num_levels = pt->last_level; +   unsigned max_lod = num_levels * 4; +   unsigned tiled = MS3_USE_FENCE_REGS; + +   assert(tex); +   assert(width); +   assert(height); +   assert(depth); + +   format = translate_texture_format(pt->format); +   pitch = tex->stride; + +   assert(format); +   assert(pitch); + +   if (tex->tiled) { +      assert(!((pitch - 1) & pitch)); +      tiled = MS3_TILED_SURFACE; +   } + +   /* MS3 state */ +   state[0] = +      (((height - 1) << MS3_HEIGHT_SHIFT) +       | ((width - 1) << MS3_WIDTH_SHIFT) +       | format +       | tiled); + +   /* +    * XXX When min_filter != mag_filter and there's just one mipmap level, +    * set max_lod = 1 to make sure i915 chooses between min/mag filtering. +    */ + +   /* See note at the top of file */ +   if (max_lod > (sampler->maxlod >> 2)) +      max_lod = sampler->maxlod >> 2; + +   /* MS4 state */ +   state[1] = +      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) +       | MS4_CUBE_FACE_ENA_MASK +       | ((max_lod) << MS4_MAX_LOD_SHIFT) +       | ((depth - 1) << MS4_VOLUME_DEPTH_SHIFT)); +} + + +void +i915_update_textures(struct i915_context *i915) +{ +   uint unit; + +   for (unit = 0; unit < i915->num_textures && unit < i915->num_samplers; +        unit++) { +      /* determine unit enable/disable by looking for a bound texture */ +      /* could also examine the fragment program? */ +      if (i915->texture[unit]) { +	 i915_update_texture( i915, +	                      unit, +	                      i915->texture[unit],          /* texture */ +	                      i915->sampler[unit],          /* sampler state */ +	                      i915->current.texbuffer[unit] ); +      } +   } + +   i915->hardware_dirty |= I915_HW_MAP; +} diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c new file mode 100644 index 0000000000..94e2deaf61 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_surface.c @@ -0,0 +1,126 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_context.h"
+#include "i915_blit.h"
+#include "i915_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_tile.h"
+#include "util/u_rect.h"
+
+
+/* Assumes all values are within bounds -- no checking at this level -
+ * do it higher up if required.
+ */
+static void
+i915_surface_copy(struct pipe_context *pipe,
+                  boolean do_flip,
+                  struct pipe_surface *dst,
+                  unsigned dstx, unsigned dsty,
+                  struct pipe_surface *src,
+                  unsigned srcx, unsigned srcy, unsigned width, unsigned height)
+{
+   assert( dst != src );
+   assert( dst->block.size == src->block.size );
+   assert( dst->block.width == src->block.width );
+   assert( dst->block.height == src->block.height );
+
+   if (0) {
+      void *dst_map = pipe->screen->surface_map( pipe->screen,
+                                                 dst,
+                                                 PIPE_BUFFER_USAGE_CPU_WRITE );
+
+      const void *src_map = pipe->screen->surface_map( pipe->screen,
+                                                       src,
+                                                       PIPE_BUFFER_USAGE_CPU_READ );
+
+      pipe_copy_rect(dst_map,
+                     &dst->block,
+                     dst->stride,
+                     dstx, dsty,
+                     width, height,
+                     src_map,
+                     do_flip ? -(int) src->stride : src->stride,
+                     srcx, do_flip ? 
height - 1 - srcy : srcy); + +      pipe->screen->surface_unmap(pipe->screen, src); +      pipe->screen->surface_unmap(pipe->screen, dst); +   } +   else { +      struct i915_texture *dst_tex = (struct i915_texture *)dst->texture; +      struct i915_texture *src_tex = (struct i915_texture *)src->texture; +      assert(dst->block.width == 1); +      assert(dst->block.height == 1); +      i915_copy_blit( i915_context(pipe), +                      do_flip, +                      dst->block.size, +		      (unsigned short) src->stride, src_tex->buffer, src->offset, +		      (unsigned short) dst->stride, dst_tex->buffer, dst->offset, +		      (short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height ); +   } +} + + +static void +i915_surface_fill(struct pipe_context *pipe, +		  struct pipe_surface *dst, +		  unsigned dstx, unsigned dsty, +		  unsigned width, unsigned height, unsigned value) +{ +   if (0) { +      void *dst_map = pipe->screen->surface_map( pipe->screen, +                                                 dst, +                                                 PIPE_BUFFER_USAGE_CPU_WRITE ); + +      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value); + +      pipe->screen->surface_unmap(pipe->screen, dst); +   } +   else { +      struct i915_texture *tex = (struct i915_texture *)dst->texture; +      assert(dst->block.width == 1); +      assert(dst->block.height == 1); +      i915_fill_blit( i915_context(pipe), +		      dst->block.size, +		      (unsigned short) dst->stride, +		      tex->buffer, dst->offset, +		      (short) dstx, (short) dsty, +		      (short) width, (short) height, +		      value ); +   } +} + + +void +i915_init_surface_functions(struct i915_context *i915) +{ +   i915->pipe.surface_copy = i915_surface_copy; +   i915->pipe.surface_fill = i915_surface_fill; +} diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c new file mode 100644 index 0000000000..b2ca3a2286 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_texture.c @@ -0,0 +1,771 @@ +/************************************************************************** + *  + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
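The surface_copy/surface_fill hooks installed above are the i915's entire 2D path: the disabled if (0) branches show the mapped software fallback through pipe_copy_rect()/pipe_fill_rect(), while the live branches go through the blitter and therefore require 1x1-block (uncompressed) surfaces. A hedged usage sketch follows; the surfaces and coordinates are invented for illustration.

   /* Illustrative only: copy a 64x64 region between two uncompressed
    * surfaces through the pipe_context hook installed by
    * i915_init_surface_functions().  FALSE disables the vertical flip.
    */
   static void
   copy_region_example(struct pipe_context *pipe,
                       struct pipe_surface *dst,
                       struct pipe_surface *src)
   {
      pipe->surface_copy(pipe, FALSE,
                         dst, 0, 0,    /* dstx, dsty */
                         src, 0, 0,    /* srcx, srcy */
                         64, 64);      /* width, height */
   }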
+ *  + **************************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  *   Michel Dänzer <michel@tungstengraphics.com> +  */ + +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "i915_context.h" +#include "i915_texture.h" +#include "i915_debug.h" +#include "i915_screen.h" + +/* + * Helper function and arrays + */ + +/** + * Initial offset for Cube map. + */ +static const int initial_offsets[6][2] = { +   {0, 0}, +   {0, 2}, +   {1, 0}, +   {1, 2}, +   {1, 1}, +   {1, 3} +}; + +/** + * Step offsets for Cube map. + */ +static const int step_offsets[6][2] = { +   {0, 2}, +   {0, 2}, +   {-1, 2}, +   {-1, 2}, +   {-1, 1}, +   {-1, 1} +}; + +static unsigned minify( unsigned d ) +{ +   return MAX2(1, d>>1); +} + +static unsigned +power_of_two(unsigned x) +{ +   unsigned value = 1; +   while (value < x) +      value = value << 1; +   return value; +} + +static unsigned +round_up(unsigned n, unsigned multiple) +{ +   return (n + multiple - 1) & ~(multiple - 1); +} + + +/* + * More advanced helper funcs + */ + + +static void +i915_miptree_set_level_info(struct i915_texture *tex, +                             unsigned level, +                             unsigned nr_images, +                             unsigned w, unsigned h, unsigned d) +{ +   struct pipe_texture *pt = &tex->base; + +   assert(level < PIPE_MAX_TEXTURE_LEVELS); + +   pt->width[level] = w; +   pt->height[level] = h; +   pt->depth[level] = d; +    +   pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w); +   pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h); + +   tex->nr_images[level] = nr_images; + +   /* +   DBG("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, +       level, w, h, d, x, y, tex->level_offset[level]); +   */ + +   /* Not sure when this would happen, but anyway:  +    */ +   if (tex->image_offset[level]) { +      FREE(tex->image_offset[level]); +      tex->image_offset[level] = NULL; +   } + +   assert(nr_images); +   assert(!tex->image_offset[level]); + +   tex->image_offset[level] = (unsigned *) MALLOC(nr_images * sizeof(unsigned)); +   tex->image_offset[level][0] = 0; +} + +static void +i915_miptree_set_image_offset(struct i915_texture *tex, +			      unsigned level, unsigned img, unsigned x, unsigned y) +{ +   if (img == 0 && level == 0) +      assert(x == 0 && y == 0); + +   assert(img < tex->nr_images[level]); + +   tex->image_offset[level][img] = y * tex->stride + x * tex->base.block.size; + +   /* +   printf("%s level %d img %d pos %d,%d image_offset %x\n", +       __FUNCTION__, level, img, x, y, tex->image_offset[level][img]); +   */ +} + + +/* + * Layout functions + */ + + +/** + * Special case to deal with display targets. 
+ */ +static boolean +i915_displaytarget_layout(struct i915_texture *tex) +{ +   struct pipe_texture *pt = &tex->base; + +   if (pt->last_level > 0 || pt->block.size != 4) +      return 0; + +   i915_miptree_set_level_info( tex, 0, 1, +                                tex->base.width[0], +                                tex->base.height[0], +                                1 ); +   i915_miptree_set_image_offset( tex, 0, 0, 0, 0 ); + +   if (tex->base.width[0] >= 128) { +      tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size); +      tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8); +#if 0 /* used for tiled display targets */ +      tex->tiled = 1; +#endif +   } else { +      tex->stride = round_up(tex->base.nblocksx[0] * pt->block.size, 64); +      tex->total_nblocksy = tex->base.nblocksy[0]; +   } + +   /* +   printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, +      tex->base.width[0], tex->base.height[0], pt->block.size, +      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy); +   */ + +   return 1; +} + +static void +i945_miptree_layout_2d( struct i915_texture *tex ) +{ +   struct pipe_texture *pt = &tex->base; +   const int align_x = 2, align_y = 4; +   unsigned level; +   unsigned x = 0; +   unsigned y = 0; +   unsigned width = pt->width[0]; +   unsigned height = pt->height[0]; +   unsigned nblocksx = pt->nblocksx[0]; +   unsigned nblocksy = pt->nblocksy[0]; + +   /* used for tiled display targets */ +   if (0) +      if (i915_displaytarget_layout(tex)) +	 return; + +   tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + +   /* May need to adjust pitch to accomodate the placement of +    * the 2nd mipmap level.  This occurs when the alignment +    * constraints of mipmap placement push the right edge of the +    * 2nd mipmap level out past the width of its parent. +    */ +   if (pt->last_level > 0) { +      unsigned mip1_nblocksx  +	 = align(pf_get_nblocksx(&pt->block, minify(width)), align_x) +         + pf_get_nblocksx(&pt->block, minify(minify(width))); + +      if (mip1_nblocksx > nblocksx) +	 tex->stride = mip1_nblocksx * pt->block.size; +   } + +   /* Pitch must be a whole number of dwords +    */ +   tex->stride = align(tex->stride, 64); +   tex->total_nblocksy = 0; + +   for (level = 0; level <= pt->last_level; level++) { +      i915_miptree_set_level_info(tex, level, 1, width, height, 1); +      i915_miptree_set_image_offset(tex, level, 0, x, y); + +      nblocksy = align(nblocksy, align_y); + +      /* Because the images are packed better, the final offset +       * might not be the maximal one: +       */ +      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy); + +      /* Layout_below: step right after second mipmap level. 
+       */ +      if (level == 1) { +	 x += align(nblocksx, align_x); +      } +      else { +	 y += nblocksy; +      } + +      width  = minify(width); +      height = minify(height); +      nblocksx = pf_get_nblocksx(&pt->block, width); +      nblocksy = pf_get_nblocksy(&pt->block, height); +   } +} + +static void +i945_miptree_layout_cube(struct i915_texture *tex) +{ +   struct pipe_texture *pt = &tex->base; +   unsigned level; + +   const unsigned nblocks = pt->nblocksx[0]; +   unsigned face; +   unsigned width = pt->width[0]; +   unsigned height = pt->height[0]; + +   /* +   printf("%s %i, %i\n", __FUNCTION__, pt->width[0], pt->height[0]); +   */ + +   assert(width == height); /* cubemap images are square */ + +   /* +    * XXX Should only be used for compressed formats. But lets +    * keep this code active just in case. +    * +    * Depending on the size of the largest images, pitch can be +    * determined either by the old-style packing of cubemap faces, +    * or the final row of 4x4, 2x2 and 1x1 faces below this. +    */ +   if (nblocks > 32) +      tex->stride = round_up(nblocks * pt->block.size * 2, 4); +   else +      tex->stride = 14 * 8 * pt->block.size; + +   tex->total_nblocksy = nblocks * 4; + +   /* Set all the levels to effectively occupy the whole rectangular region. +   */ +   for (level = 0; level <= pt->last_level; level++) { +      i915_miptree_set_level_info(tex, level, 6, width, height, 1); +      width /= 2; +      height /= 2; +   } + +   for (face = 0; face < 6; face++) { +      unsigned x = initial_offsets[face][0] * nblocks; +      unsigned y = initial_offsets[face][1] * nblocks; +      unsigned d = nblocks; + +#if 0 /* Fix and enable this code for compressed formats */ +      if (nblocks == 4 && face >= 4) { +         y = tex->total_height - 4; +         x = (face - 4) * 8; +      } +      else if (nblocks < 4 && (face > 0)) { +         y = tex->total_height - 4; +         x = face * 8; +      } +#endif + +      for (level = 0; level <= pt->last_level; level++) { +         i915_miptree_set_image_offset(tex, level, face, x, y); + +         d >>= 1; + +#if 0 /* Fix and enable this code for compressed formats */ +         switch (d) { +            case 4: +               switch (face) { +                  case PIPE_TEX_FACE_POS_X: +                  case PIPE_TEX_FACE_NEG_X: +                     x += step_offsets[face][0] * d; +                     y += step_offsets[face][1] * d; +                     break; +                  case PIPE_TEX_FACE_POS_Y: +                  case PIPE_TEX_FACE_NEG_Y: +                     y += 12; +                     x -= 8; +                     break; +                  case PIPE_TEX_FACE_POS_Z: +                  case PIPE_TEX_FACE_NEG_Z: +                     y = tex->total_height - 4; +                     x = (face - 4) * 8; +                     break; +               } +            case 2: +               y = tex->total_height - 4; +               x = 16 + face * 8; +               break; + +            case 1: +               x += 48; +               break; +            default: +#endif +               x += step_offsets[face][0] * d; +               y += step_offsets[face][1] * d; +#if 0 +               break; +         } +#endif +      } +   } +} + +static boolean +i915_miptree_layout(struct i915_texture * tex) +{ +   struct pipe_texture *pt = &tex->base; +   unsigned level; + +   switch (pt->target) { +   case PIPE_TEXTURE_CUBE: { +         const unsigned nblocks = pt->nblocksx[0]; +         unsigned face; +        
 unsigned width = pt->width[0], height = pt->height[0]; + +         assert(width == height); /* cubemap images are square */ + +         /* double pitch for cube layouts */ +         tex->stride = round_up(nblocks * pt->block.size * 2, 4); +         tex->total_nblocksy = nblocks * 4; + +         for (level = 0; level <= pt->last_level; level++) { +            i915_miptree_set_level_info(tex, level, 6, +                                         width, height, +                                         1); +            width /= 2; +            height /= 2; +         } + +         for (face = 0; face < 6; face++) { +            unsigned x = initial_offsets[face][0] * nblocks; +            unsigned y = initial_offsets[face][1] * nblocks; +            unsigned d = nblocks; + +            for (level = 0; level <= pt->last_level; level++) { +               i915_miptree_set_image_offset(tex, level, face, x, y); +               d >>= 1; +               x += step_offsets[face][0] * d; +               y += step_offsets[face][1] * d; +            } +         } +         break; +      } +   case PIPE_TEXTURE_3D:{ +         unsigned width = pt->width[0]; +         unsigned height = pt->height[0]; +         unsigned depth = pt->depth[0]; +         unsigned nblocksx = pt->nblocksx[0]; +         unsigned nblocksy = pt->nblocksy[0]; +         unsigned stack_nblocksy = 0; + +         /* Calculate the size of a single slice.  +          */ +         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + +         /* XXX: hardware expects/requires 9 levels at minimum. +          */ +         for (level = 0; level <= MAX2(8, pt->last_level); +              level++) { +            i915_miptree_set_level_info(tex, level, depth, +                                        width, height, depth); + + +            stack_nblocksy += MAX2(2, nblocksy); + +            width = minify(width); +            height = minify(height); +            depth = minify(depth); +            nblocksx = pf_get_nblocksx(&pt->block, width); +            nblocksy = pf_get_nblocksy(&pt->block, height); +         } + +         /* Fixup depth image_offsets:  +          */ +         depth = pt->depth[0]; +         for (level = 0; level <= pt->last_level; level++) { +            unsigned i; +            for (i = 0; i < depth; i++)  +               i915_miptree_set_image_offset(tex, level, i, +                                             0, i * stack_nblocksy); + +            depth = minify(depth); +         } + + +         /* Multiply slice size by texture depth for total size.  It's +          * remarkable how wasteful of memory the i915 texture layouts +          * are.  They are largely fixed in the i945. 
+          */ +         tex->total_nblocksy = stack_nblocksy * pt->depth[0]; +         break; +      } + +   default:{ +         unsigned width = pt->width[0]; +         unsigned height = pt->height[0]; +         unsigned nblocksx = pt->nblocksx[0]; +         unsigned nblocksy = pt->nblocksy[0]; + +         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); +         tex->total_nblocksy = 0; + +         for (level = 0; level <= pt->last_level; level++) { +            i915_miptree_set_level_info(tex, level, 1, +                                        width, height, 1); +            i915_miptree_set_image_offset(tex, level, 0, +                                          0, tex->total_nblocksy); + +            nblocksy = round_up(MAX2(2, nblocksy), 2); + +	    tex->total_nblocksy += nblocksy; + +            width = minify(width); +            height = minify(height); +            nblocksx = pf_get_nblocksx(&pt->block, width); +            nblocksy = pf_get_nblocksy(&pt->block, height); +         } +         break; +      } +   } +   /* +   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, +       tex->pitch, +       tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy); +   */ + +   return TRUE; +} + + +static boolean +i945_miptree_layout(struct i915_texture * tex) +{ +   struct pipe_texture *pt = &tex->base; +   unsigned level; + +   switch (pt->target) { +   case PIPE_TEXTURE_CUBE: +      i945_miptree_layout_cube(tex); +      break; +   case PIPE_TEXTURE_3D:{ +         unsigned width = pt->width[0]; +         unsigned height = pt->height[0]; +         unsigned depth = pt->depth[0]; +         unsigned nblocksx = pt->nblocksx[0]; +         unsigned nblocksy = pt->nblocksy[0]; +         unsigned pack_x_pitch, pack_x_nr; +         unsigned pack_y_pitch; + +         tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); +         tex->total_nblocksy = 0; + +         pack_y_pitch = MAX2(pt->nblocksy[0], 2); +         pack_x_pitch = tex->stride / pt->block.size; +         pack_x_nr = 1; + +         for (level = 0; level <= pt->last_level; level++) { +            unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? 
depth : 6; +            int x = 0; +            int y = 0; +            unsigned q, j; + +            i915_miptree_set_level_info(tex, level, nr_images, +                                        width, height, depth); + +            for (q = 0; q < nr_images;) { +               for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) { +                  i915_miptree_set_image_offset(tex, level, q, x, y + tex->total_nblocksy); +                  x += pack_x_pitch; +               } + +               x = 0; +               y += pack_y_pitch; +            } + + +            tex->total_nblocksy += y; + +            if (pack_x_pitch > 4) { +               pack_x_pitch >>= 1; +               pack_x_nr <<= 1; +               assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride); +            } + +            if (pack_y_pitch > 2) { +               pack_y_pitch >>= 1; +            } + +            width = minify(width); +            height = minify(height); +            depth = minify(depth); +            nblocksx = pf_get_nblocksx(&pt->block, width); +            nblocksy = pf_get_nblocksy(&pt->block, height); +         } +         break; +      } + +   case PIPE_TEXTURE_1D: +   case PIPE_TEXTURE_2D: +//   case PIPE_TEXTURE_RECTANGLE: +         i945_miptree_layout_2d(tex); +         break; +   default: +      assert(0); +      return FALSE; +   } + +   /* +   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, +       tex->pitch, +       tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy); +   */ + +   return TRUE; +} + + +static struct pipe_texture * +i915_texture_create(struct pipe_screen *screen, +                    const struct pipe_texture *templat) +{ +   struct i915_screen *i915screen = i915_screen(screen); +   struct pipe_winsys *ws = screen->winsys; +   struct i915_texture *tex = CALLOC_STRUCT(i915_texture); +   size_t tex_size; + +   if (!tex) +      return NULL; + +   tex->base = *templat; +   tex->base.refcount = 1; +   tex->base.screen = screen; + +   tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]); +   tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]); +    +   if (i915screen->is_i945) { +      if (!i945_miptree_layout(tex)) +	 goto fail; +   } else { +      if (!i915_miptree_layout(tex)) +	 goto fail; +   } + +   tex_size = tex->stride * tex->total_nblocksy; + +   tex->buffer = ws->buffer_create(ws, 64, +                                    PIPE_BUFFER_USAGE_PIXEL, +                                    tex_size); + +   if (!tex->buffer) +      goto fail; + +#if 0 +   void *ptr = ws->buffer_map(ws, tex->buffer, +      PIPE_BUFFER_USAGE_CPU_WRITE); +   memset(ptr, 0x80, tex_size); +   ws->buffer_unmap(ws, tex->buffer); +#endif + +   return &tex->base; + +fail: +   FREE(tex); +   return NULL; +} + + +static void +i915_texture_release(struct pipe_screen *screen, +                     struct pipe_texture **pt) +{ +   if (!*pt) +      return; + +   /* +   DBG("%s %p refcount will be %d\n", +       __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); +   */ +   if (--(*pt)->refcount <= 0) { +      struct i915_texture *tex = (struct i915_texture *)*pt; +      uint i; + +      /* +      DBG("%s deleting %p\n", __FUNCTION__, (void *) tex); +      */ + +      pipe_buffer_reference(screen, &tex->buffer, NULL); + +      for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) +         if (tex->image_offset[i]) +            FREE(tex->image_offset[i]); + +      FREE(tex); +   } +   *pt = NULL; +} + +static struct pipe_surface * 
+i915_get_tex_surface(struct pipe_screen *screen, +                     struct pipe_texture *pt, +                     unsigned face, unsigned level, unsigned zslice, +                     unsigned flags) +{ +   struct i915_texture *tex = (struct i915_texture *)pt; +   struct pipe_surface *ps; +   unsigned offset;  /* in bytes */ + +   if (pt->target == PIPE_TEXTURE_CUBE) { +      offset = tex->image_offset[level][face]; +   } +   else if (pt->target == PIPE_TEXTURE_3D) { +      offset = tex->image_offset[level][zslice]; +   } +   else { +      offset = tex->image_offset[level][0]; +      assert(face == 0); +      assert(zslice == 0); +   } + +   ps = CALLOC_STRUCT(pipe_surface); +   if (ps) { +      ps->refcount = 1; +      pipe_texture_reference(&ps->texture, pt); +      ps->format = pt->format; +      ps->width = pt->width[level]; +      ps->height = pt->height[level]; +      ps->block = pt->block; +      ps->nblocksx = pt->nblocksx[level]; +      ps->nblocksy = pt->nblocksy[level]; +      ps->stride = tex->stride; +      ps->offset = offset; +      ps->usage = flags; +      ps->status = PIPE_SURFACE_STATUS_DEFINED; +   } +   return ps; +} + +static struct pipe_texture * +i915_texture_blanket(struct pipe_screen * screen, +                     const struct pipe_texture *base, +                     const unsigned *stride, +                     struct pipe_buffer *buffer) +{ +   struct i915_texture *tex; +   assert(screen); + +   /* Only supports one type */ +   if (base->target != PIPE_TEXTURE_2D || +       base->last_level != 0 || +       base->depth[0] != 1) { +      return NULL; +   } + +   tex = CALLOC_STRUCT(i915_texture); +   if (!tex) +      return NULL; + +   tex->base = *base; +   tex->base.refcount = 1; +   tex->base.screen = screen; + +   tex->stride = stride[0]; + +   i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1); +   i915_miptree_set_image_offset(tex, 0, 0, 0, 0); + +   pipe_buffer_reference(screen, &tex->buffer, buffer); + +   return &tex->base; +} + +void +i915_init_texture_functions(struct i915_context *i915) +{ +//   i915->pipe.texture_update = i915_texture_update; +} + +static void +i915_tex_surface_release(struct pipe_screen *screen, +                         struct pipe_surface **surface) +{ +   struct pipe_surface *surf = *surface; + +   if (--surf->refcount == 0) { + +      /* This really should not be possible, but it's actually +       * happening quite a bit...  Will fix. +       */ +      if (surf->status == PIPE_SURFACE_STATUS_CLEAR) { +         debug_printf("XXX destroying a surface with pending clears...\n"); +         assert(0); +      } + +      pipe_texture_reference(&surf->texture, NULL); +      FREE(surf); +   } + +   *surface = NULL; +} + +void +i915_init_screen_texture_functions(struct pipe_screen *screen) +{ +   screen->texture_create = i915_texture_create; +   screen->texture_release = i915_texture_release; +   screen->get_tex_surface = i915_get_tex_surface; +   screen->texture_blanket = i915_texture_blanket; +   screen->tex_surface_release = i915_tex_surface_release; +} diff --git a/src/gallium/drivers/i915simple/i915_texture.h b/src/gallium/drivers/i915simple/i915_texture.h new file mode 100644 index 0000000000..7225016a9f --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_texture.h @@ -0,0 +1,43 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
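i915_get_tex_surface() above picks the byte offset for the requested face or z-slice out of the per-level image_offset table built by the layout code, so every sub-image is reachable through the same screen hook. A sketch of such a call, with illustrative arguments (the usage flag is only an example of the PIPE_BUFFER_USAGE_* values from p_defines.h):

   /* Illustrative only: request a view of cube face 2, mip level 0. */
   static struct pipe_surface *
   get_cube_face_example(struct pipe_screen *screen,
                         struct pipe_texture *tex)
   {
      return screen->get_tex_surface(screen, tex,
                                     2,   /* face */
                                     0,   /* level */
                                     0,   /* zslice */
                                     PIPE_BUFFER_USAGE_GPU_WRITE);
   }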
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef I915_TEXTURE_H +#define I915_TEXTURE_H + +struct i915_context; +struct pipe_screen; + + +extern void +i915_init_texture_functions(struct i915_context *i915); + + +extern void +i915_init_screen_texture_functions(struct pipe_screen *screen); + + +#endif /* I915_TEXTURE_H */ diff --git a/src/gallium/drivers/i915simple/i915_winsys.h b/src/gallium/drivers/i915simple/i915_winsys.h new file mode 100644 index 0000000000..81904c2a74 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_winsys.h @@ -0,0 +1,121 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \file + * This is the interface that i915simple requires any window system + * hosting it to implement.  This is the only include file in i915simple + * which is public. + *  + */ + +#ifndef I915_WINSYS_H +#define I915_WINSYS_H + + +#include "pipe/p_defines.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +    +/* Pipe drivers are (meant to be!) independent of both GL and the + * window system. 
 The window system provides a buffer manager and a
+ * set of additional hooks for things like command buffer submission,
+ * etc.
+ *
+ * There clearly has to be some agreement between the window system
+ * driver and the hardware driver about the format of command buffers,
+ * etc.
+ */
+
+struct i915_batchbuffer;
+struct pipe_buffer;
+struct pipe_fence_handle;
+struct pipe_winsys;
+struct pipe_screen;
+
+
+/**
+ * Additional winsys interface for i915simple.
+ *
+ * This is an over-simple batchbuffer mechanism.  We will want to improve
+ * its performance, perhaps based on the cmdstream work.  It would be
+ * pretty much impossible to implement swz on top of this interface.
+ *
+ * Additions/changes will also be needed to implement static/dynamic
+ * indirect state.
+ */
+struct i915_winsys {
+
+   void (*destroy)( struct i915_winsys *sws );
+
+   /**
+    * Get the current batch buffer from the winsys.
+    */
+   struct i915_batchbuffer *(*batch_get)( struct i915_winsys *sws );
+
+   /**
+    * Emit a relocation to a buffer.
+    *
+    * Used not only when the buffer addresses are not pinned, but also to
+    * ensure that referenced buffers will not be destroyed until the current
+    * batch buffer execution is finished.
+    *
+    * The access flags are a combination of the I915_BUFFER_ACCESS_WRITE and
+    * I915_BUFFER_ACCESS_READ macros.
+    */
+   void (*batch_reloc)( struct i915_winsys *sws,
+			struct pipe_buffer *buf,
+			unsigned access_flags,
+			unsigned delta );
+
+   /**
+    * Flush the batch.
+    */
+   void (*batch_flush)( struct i915_winsys *sws,
+                        struct pipe_fence_handle **fence );
+};
+
+#define I915_BUFFER_ACCESS_WRITE   0x1
+#define I915_BUFFER_ACCESS_READ    0x2
+
+#define I915_BUFFER_USAGE_LIT_VERTEX  (PIPE_BUFFER_USAGE_CUSTOM << 0)
+
+
+struct pipe_context *i915_create_context( struct pipe_screen *,
+                                          struct pipe_winsys *,
+                                          struct i915_winsys * );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
new file mode 100644
index 0000000000..e97146e57c
--- /dev/null
+++ b/src/gallium/drivers/i965simple/Makefile
@@ -0,0 +1,54 @@
+TOP = ../../../..
+include $(TOP)/configs/current + +LIBNAME = i965simple + +C_SOURCES = \ +	brw_blit.c \ +	brw_flush.c \ +	brw_screen.c \ +	brw_surface.c \ +	brw_cc.c \ +	brw_clip.c \ +	brw_clip_line.c \ +	brw_clip_point.c \ +	brw_clip_state.c \ +	brw_clip_tri.c \ +	brw_clip_util.c \ +	brw_context.c \ +	brw_curbe.c \ +	brw_draw.c \ +	brw_draw_upload.c \ +	brw_eu.c \ +	brw_eu_debug.c \ +	brw_eu_emit.c \ +	brw_eu_util.c \ +	brw_gs.c \ +	brw_gs_emit.c \ +	brw_gs_state.c \ +	brw_misc_state.c \ +	brw_sf.c \ +	brw_sf_emit.c \ +	brw_sf_state.c \ +	brw_state.c \ +	brw_state_batch.c \ +	brw_state_cache.c \ +	brw_state_pool.c \ +	brw_state_upload.c \ +	brw_tex_layout.c \ +	brw_urb.c \ +	brw_util.c \ +	brw_vs.c \ +	brw_vs_emit.c \ +	brw_vs_state.c \ +	brw_wm.c \ +	brw_wm_iz.c \ +	brw_wm_decl.c \ +	brw_wm_glsl.c \ +	brw_wm_sampler_state.c \ +	brw_wm_state.c \ +	brw_wm_surface_state.c + +include ../../Makefile.template + +symlinks: diff --git a/src/gallium/drivers/i965simple/SConscript b/src/gallium/drivers/i965simple/SConscript new file mode 100644 index 0000000000..43fc2a4005 --- /dev/null +++ b/src/gallium/drivers/i965simple/SConscript @@ -0,0 +1,54 @@ +Import('*') + +env = env.Clone() + +i965simple = env.ConvenienceLibrary( +	target = 'i965simple', +	source = [ +		'brw_blit.c', +		'brw_cc.c', +		'brw_clip.c', +		'brw_clip_line.c', +		'brw_clip_point.c', +		'brw_clip_state.c', +		'brw_clip_tri.c', +		'brw_clip_util.c', +		'brw_context.c', +		'brw_curbe.c', +		'brw_draw.c', +		'brw_draw_upload.c', +		'brw_eu.c', +		'brw_eu_debug.c', +		'brw_eu_emit.c', +		'brw_eu_util.c', +		'brw_flush.c', +		'brw_gs.c', +		'brw_gs_emit.c', +		'brw_gs_state.c', +		'brw_misc_state.c', +		'brw_screen.c', +		'brw_sf.c', +		'brw_sf_emit.c', +		'brw_sf_state.c', +		'brw_state.c', +		'brw_state_batch.c', +		'brw_state_cache.c', +		'brw_state_pool.c', +		'brw_state_upload.c', +		'brw_surface.c', +		'brw_tex_layout.c', +		'brw_urb.c', +		'brw_util.c', +		'brw_vs.c', +		'brw_vs_emit.c', +		'brw_vs_state.c', +		'brw_wm.c', +		'brw_wm_decl.c', +		'brw_wm_glsl.c', +		'brw_wm_iz.c', +		'brw_wm_sampler_state.c', +		'brw_wm_state.c', +		'brw_wm_surface_state.c', +	]) + +Export('i965simple') diff --git a/src/gallium/drivers/i965simple/brw_batch.h b/src/gallium/drivers/i965simple/brw_batch.h new file mode 100644 index 0000000000..5f5932a488 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_batch.h @@ -0,0 +1,59 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef BRW_BATCH_H +#define BRW_BATCH_H + +#include "brw_winsys.h" + +#define BATCH_LOCALS + +#define INTEL_BATCH_NO_CLIPRECTS 0x1 +#define INTEL_BATCH_CLIPRECTS    0x2 + +#define BEGIN_BATCH( dwords, relocs ) \ +   brw->winsys->batch_start(brw->winsys, dwords, relocs) + +#define OUT_BATCH( dword ) \ +   brw->winsys->batch_dword(brw->winsys, dword) + +#define OUT_RELOC( buf, flags, delta ) \ +   brw->winsys->batch_reloc(brw->winsys, buf, flags, delta) + +#define ADVANCE_BATCH() \ +   brw->winsys->batch_end( brw->winsys ) + +/* XXX: this is bogus - need proper handling for out-of-memory in batchbuffer. + */ +#define FLUSH_BATCH(fence) do {				\ +   brw->winsys->batch_flush(brw->winsys, fence);	\ +   brw->hardware_dirty = ~0;				\ +} while (0) + +#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->winsys, (s), sizeof(*(s))) + +#endif diff --git a/src/gallium/drivers/i965simple/brw_blit.c b/src/gallium/drivers/i965simple/brw_blit.c new file mode 100644 index 0000000000..4d11f8d2ab --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_blit.c @@ -0,0 +1,218 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
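The BEGIN_BATCH/OUT_BATCH/OUT_RELOC/ADVANCE_BATCH macros defined in brw_batch.h above forward straight to the brw_winsys callbacks, so the window system owns the actual command buffer and relocation bookkeeping. A toy sketch of what the dword-emission side of such a winsys could look like is shown below; the struct layout (ptr/used/size) is invented for illustration and is not the real winsys.

   /* Sketch only: a trivial batch_dword()-style writer into a plain
    * memory buffer.  Real winsys code also tracks relocations,
    * aperture usage and flushing; none of that is shown here.
    */
   #include <assert.h>

   struct toy_batch {
      unsigned *ptr;     /* start of the batch buffer */
      unsigned used;     /* dwords written so far */
      unsigned size;     /* capacity in dwords */
   };

   static void
   toy_batch_dword(struct toy_batch *batch, unsigned dword)
   {
      assert(batch->used < batch->size);  /* space reserved by batch_start() */
      batch->ptr[batch->used++] = dword;
   }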
+ * + **************************************************************************/ + + +#include <stdio.h> +#include <errno.h> + +#include "brw_batch.h" +#include "brw_blit.h" +#include "brw_context.h" +#include "brw_reg.h" + +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +void brw_fill_blit(struct brw_context *brw, +                   unsigned cpp, +                   short dst_pitch, +                   struct pipe_buffer *dst_buffer, +                   unsigned dst_offset, +                   boolean dst_tiled, +                   short x, short y, +                   short w, short h, +                   unsigned color) +{ +   unsigned BR13, CMD; +   BATCH_LOCALS; + +   dst_pitch *= cpp; + +   switch(cpp) { +   case 1: +   case 2: +   case 3: +      BR13 = (0xF0 << 16) | (1<<24); +      CMD = XY_COLOR_BLT_CMD; +      break; +   case 4: +      BR13 = (0xF0 << 16) | (1<<24) | (1<<25); +      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; +      break; +   default: +      return; +   } + +   if (dst_tiled) { +      CMD |= XY_DST_TILED; +      dst_pitch /= 4; +   } + +   BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS); +   OUT_BATCH( CMD ); +   OUT_BATCH( dst_pitch | BR13 ); +   OUT_BATCH( (y << 16) | x ); +   OUT_BATCH( ((y+h) << 16) | (x+w) ); +   OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, dst_offset ); +   OUT_BATCH( color ); +   ADVANCE_BATCH(); +} + +static unsigned translate_raster_op(unsigned logicop) +{ +   switch(logicop) { +   case PIPE_LOGICOP_CLEAR: return 0x00; +   case PIPE_LOGICOP_AND: return 0x88; +   case PIPE_LOGICOP_AND_REVERSE: return 0x44; +   case PIPE_LOGICOP_COPY: return 0xCC; +   case PIPE_LOGICOP_AND_INVERTED: return 0x22; +   case PIPE_LOGICOP_NOOP: return 0xAA; +   case PIPE_LOGICOP_XOR: return 0x66; +   case PIPE_LOGICOP_OR: return 0xEE; +   case PIPE_LOGICOP_NOR: return 0x11; +   case PIPE_LOGICOP_EQUIV: return 0x99; +   case PIPE_LOGICOP_INVERT: return 0x55; +   case PIPE_LOGICOP_OR_REVERSE: return 0xDD; +   case PIPE_LOGICOP_COPY_INVERTED: return 0x33; +   case PIPE_LOGICOP_OR_INVERTED: return 0xBB; +   case PIPE_LOGICOP_NAND: return 0x77; +   case PIPE_LOGICOP_SET: return 0xFF; +   default: return 0; +   } +} + + +/* Copy BitBlt + */ +void brw_copy_blit(struct brw_context *brw, +                   unsigned do_flip, +                   unsigned cpp, +                   short src_pitch, +                   struct pipe_buffer *src_buffer, +                   unsigned  src_offset, +                   boolean src_tiled, +                   short dst_pitch, +                   struct pipe_buffer *dst_buffer, +                   unsigned  dst_offset, +                   boolean dst_tiled, +                   short src_x, short src_y, +                   short dst_x, short dst_y, +                   short w, short h, +                   unsigned logic_op) +{ +   unsigned CMD, BR13; +   int dst_y2 = dst_y + h; +   int dst_x2 = dst_x + w; +   BATCH_LOCALS; + + +   DBG("%s src:buf(%d)/%d %d,%d dst:buf(%d)/%d %d,%d sz:%dx%d op:%d\n", +       __FUNCTION__, +       src_buffer, src_pitch, src_x, src_y, +       dst_buffer, dst_pitch, dst_x, dst_y, +       w,h,logic_op); + +   assert( logic_op - PIPE_LOGICOP_CLEAR >= 0 ); +   assert( logic_op - PIPE_LOGICOP_CLEAR < 0x10 ); + +   src_pitch *= cpp; +   dst_pitch *= cpp; + +   switch(cpp) { +   case 1: +   case 2: +   case 3: +      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24); +      CMD = XY_SRC_COPY_BLT_CMD; +      break; +   case 4: 
+      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24) | +	  (1<<25); +      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; +      break; +   default: +      return; +   } + +   if (src_tiled) { +      CMD |= XY_SRC_TILED; +      src_pitch /= 4; +   } + +   if (dst_tiled) { +      CMD |= XY_DST_TILED; +      dst_pitch /= 4; +   } + +   if (dst_y2 < dst_y || +       dst_x2 < dst_x) { +      return; +   } + +   dst_pitch &= 0xffff; +   src_pitch &= 0xffff; + +   /* Initial y values don't seem to work with negative pitches.  If +    * we adjust the offsets manually (below), it seems to work fine. +    * +    * On the other hand, if we always adjust, the hardware doesn't +    * know which blit directions to use, so overlapping copypixels get +    * the wrong result. +    */ +   if (dst_pitch > 0 && src_pitch > 0) { +      BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS); +      OUT_BATCH( CMD ); +      OUT_BATCH( dst_pitch | BR13 ); +      OUT_BATCH( (dst_y << 16) | dst_x ); +      OUT_BATCH( (dst_y2 << 16) | dst_x2 ); +      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, +		 dst_offset ); +      OUT_BATCH( (src_y << 16) | src_x ); +      OUT_BATCH( src_pitch ); +      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ, +		 src_offset ); +      ADVANCE_BATCH(); +   } +   else { +      BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS); +      OUT_BATCH( CMD ); +      OUT_BATCH( (dst_pitch & 0xffff) | BR13 ); +      OUT_BATCH( (0 << 16) | dst_x ); +      OUT_BATCH( (h << 16) | dst_x2 ); +      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, +		 dst_offset + dst_y * dst_pitch ); +      OUT_BATCH( (src_pitch & 0xffff) ); +      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ, +		 src_offset + src_y * src_pitch ); +      ADVANCE_BATCH(); +   } +} + + + diff --git a/src/gallium/drivers/i965simple/brw_blit.h b/src/gallium/drivers/i965simple/brw_blit.h new file mode 100644 index 0000000000..111c5d91d3 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_blit.h @@ -0,0 +1,33 @@ +#ifndef BRW_BLIT_H +#define BRW_BLIT_H + +#include "pipe/p_compiler.h" + +struct pipe_buffer; +struct brw_context; + +void brw_fill_blit(struct brw_context *intel, +                   unsigned cpp, +                   short dst_pitch, +                   struct pipe_buffer *dst_buffer, +                   unsigned dst_offset, +                   boolean dst_tiled, +                   short x, short y, +                   short w, short h, +                   unsigned color); +void brw_copy_blit(struct brw_context *intel, +                   unsigned do_flip, +                   unsigned cpp, +                   short src_pitch, +                   struct pipe_buffer *src_buffer, +                   unsigned  src_offset, +                   boolean src_tiled, +                   short dst_pitch, +                   struct pipe_buffer *dst_buffer, +                   unsigned  dst_offset, +                   boolean dst_tiled, +                   short src_x, short src_y, +                   short dst_x, short dst_y, +                   short w, short h, +                   unsigned logic_op); +#endif diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c new file mode 100644 index 0000000000..3668123e2e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_cc.c @@ -0,0 +1,269 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
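The 8-bit values returned by translate_raster_op() above follow the classic raster-op encoding: the source operand contributes the bit pattern 0xCC and the destination 0xAA, and each logic op's code is simply that boolean function applied to the two patterns. A few standalone spot checks against the table in the code:

   /* Verifies a handful of translate_raster_op() entries against the
    * 0xCC/0xAA raster-op convention.  Illustration only.
    */
   #include <assert.h>

   static void
   rop_spot_check(void)
   {
      const unsigned src = 0xCC, dst = 0xAA;

      assert((src & dst) == 0x88);            /* PIPE_LOGICOP_AND          */
      assert((src | dst) == 0xEE);            /* PIPE_LOGICOP_OR           */
      assert((src ^ dst) == 0x66);            /* PIPE_LOGICOP_XOR          */
      assert((~(src | dst) & 0xff) == 0x11);  /* PIPE_LOGICOP_NOR          */
      assert((~src & dst) == 0x22);           /* PIPE_LOGICOP_AND_INVERTED */
      assert(src == 0xCC);                    /* PIPE_LOGICOP_COPY         */
   }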
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" + + +static int brw_translate_compare_func(int func) +{ +   switch(func) { +   case PIPE_FUNC_NEVER: +      return BRW_COMPAREFUNCTION_NEVER; +   case PIPE_FUNC_LESS: +      return BRW_COMPAREFUNCTION_LESS; +   case PIPE_FUNC_LEQUAL: +      return BRW_COMPAREFUNCTION_LEQUAL; +   case PIPE_FUNC_GREATER: +      return BRW_COMPAREFUNCTION_GREATER; +   case PIPE_FUNC_GEQUAL: +      return BRW_COMPAREFUNCTION_GEQUAL; +   case PIPE_FUNC_NOTEQUAL: +      return BRW_COMPAREFUNCTION_NOTEQUAL; +   case PIPE_FUNC_EQUAL: +      return BRW_COMPAREFUNCTION_EQUAL; +   case PIPE_FUNC_ALWAYS: +      return BRW_COMPAREFUNCTION_ALWAYS; +   } + +   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func); +   return BRW_COMPAREFUNCTION_ALWAYS; +} + +static int brw_translate_stencil_op(int op) +{ +   switch(op) { +   case PIPE_STENCIL_OP_KEEP: +      return BRW_STENCILOP_KEEP; +   case PIPE_STENCIL_OP_ZERO: +      return BRW_STENCILOP_ZERO; +   case PIPE_STENCIL_OP_REPLACE: +      return BRW_STENCILOP_REPLACE; +   case PIPE_STENCIL_OP_INCR: +      return BRW_STENCILOP_INCRSAT; +   case PIPE_STENCIL_OP_DECR: +      return BRW_STENCILOP_DECRSAT; +   case PIPE_STENCIL_OP_INCR_WRAP: +      return BRW_STENCILOP_INCR; +   case PIPE_STENCIL_OP_DECR_WRAP: +      return BRW_STENCILOP_DECR; +   case PIPE_STENCIL_OP_INVERT: +      return BRW_STENCILOP_INVERT; +   default: +      return BRW_STENCILOP_ZERO; +   } +} + + +static int brw_translate_logic_op(int opcode) +{ +   switch(opcode) { +   case PIPE_LOGICOP_CLEAR: +      return BRW_LOGICOPFUNCTION_CLEAR; +   case PIPE_LOGICOP_AND: +      return BRW_LOGICOPFUNCTION_AND; +   case PIPE_LOGICOP_AND_REVERSE: +      return BRW_LOGICOPFUNCTION_AND_REVERSE; +   case PIPE_LOGICOP_COPY: +      return BRW_LOGICOPFUNCTION_COPY; +   case PIPE_LOGICOP_COPY_INVERTED: +      return BRW_LOGICOPFUNCTION_COPY_INVERTED; +   case PIPE_LOGICOP_AND_INVERTED: +      return BRW_LOGICOPFUNCTION_AND_INVERTED; +   case PIPE_LOGICOP_NOOP: +      return BRW_LOGICOPFUNCTION_NOOP; +   case PIPE_LOGICOP_XOR: +      return BRW_LOGICOPFUNCTION_XOR; +   case PIPE_LOGICOP_OR: +      return BRW_LOGICOPFUNCTION_OR; +  
 case PIPE_LOGICOP_OR_INVERTED: +      return BRW_LOGICOPFUNCTION_OR_INVERTED; +   case PIPE_LOGICOP_NOR: +      return BRW_LOGICOPFUNCTION_NOR; +   case PIPE_LOGICOP_EQUIV: +      return BRW_LOGICOPFUNCTION_EQUIV; +   case PIPE_LOGICOP_INVERT: +      return BRW_LOGICOPFUNCTION_INVERT; +   case PIPE_LOGICOP_OR_REVERSE: +      return BRW_LOGICOPFUNCTION_OR_REVERSE; +   case PIPE_LOGICOP_NAND: +      return BRW_LOGICOPFUNCTION_NAND; +   case PIPE_LOGICOP_SET: +      return BRW_LOGICOPFUNCTION_SET; +   default: +      return BRW_LOGICOPFUNCTION_SET; +   } +} + + +static void upload_cc_vp( struct brw_context *brw ) +{ +   struct brw_cc_viewport ccv; + +   memset(&ccv, 0, sizeof(ccv)); + +   ccv.min_depth = 0.0; +   ccv.max_depth = 1.0; + +   brw->cc.vp_gs_offset = brw_cache_data( &brw->cache[BRW_CC_VP], &ccv ); +} + +const struct brw_tracked_state brw_cc_vp = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_cc_vp +}; + + +static void upload_cc_unit( struct brw_context *brw ) +{ +   struct brw_cc_unit_state cc; + +   memset(&cc, 0, sizeof(cc)); + +   /* BRW_NEW_DEPTH_STENCIL */ +   if (brw->attribs.DepthStencil->stencil[0].enabled) { +      cc.cc0.stencil_enable = brw->attribs.DepthStencil->stencil[0].enabled; +      cc.cc0.stencil_func = brw_translate_compare_func(brw->attribs.DepthStencil->stencil[0].func); +      cc.cc0.stencil_fail_op = brw_translate_stencil_op(brw->attribs.DepthStencil->stencil[0].fail_op); +      cc.cc0.stencil_pass_depth_fail_op = brw_translate_stencil_op( +         brw->attribs.DepthStencil->stencil[0].zfail_op); +      cc.cc0.stencil_pass_depth_pass_op = brw_translate_stencil_op( +         brw->attribs.DepthStencil->stencil[0].zpass_op); +      cc.cc1.stencil_ref = brw->attribs.DepthStencil->stencil[0].ref_value; +      cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].writemask; +      cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].valuemask; + +      if (brw->attribs.DepthStencil->stencil[1].enabled) { +	 cc.cc0.bf_stencil_enable = brw->attribs.DepthStencil->stencil[1].enabled; +	 cc.cc0.bf_stencil_func = brw_translate_compare_func( +            brw->attribs.DepthStencil->stencil[1].func); +	 cc.cc0.bf_stencil_fail_op = brw_translate_stencil_op( +            brw->attribs.DepthStencil->stencil[1].fail_op); +	 cc.cc0.bf_stencil_pass_depth_fail_op = brw_translate_stencil_op( +            brw->attribs.DepthStencil->stencil[1].zfail_op); +	 cc.cc0.bf_stencil_pass_depth_pass_op = brw_translate_stencil_op( +            brw->attribs.DepthStencil->stencil[1].zpass_op); +	 cc.cc1.bf_stencil_ref = brw->attribs.DepthStencil->stencil[1].ref_value; +	 cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].writemask; +	 cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].valuemask; +      } + +      /* Not really sure about this: +       */ +      if (brw->attribs.DepthStencil->stencil[0].writemask || +	  brw->attribs.DepthStencil->stencil[1].writemask) +	 cc.cc0.stencil_write_enable = 1; +   } + +   /* BRW_NEW_BLEND */ +   if (brw->attribs.Blend->logicop_enable) { +      cc.cc2.logicop_enable = 1; +      cc.cc5.logicop_func = brw_translate_logic_op( brw->attribs.Blend->logicop_func ); +   } +   else if (brw->attribs.Blend->blend_enable) { +      int eqRGB = brw->attribs.Blend->rgb_func; +      int eqA = brw->attribs.Blend->alpha_func; +      int srcRGB = brw->attribs.Blend->rgb_src_factor; +      int dstRGB = brw->attribs.Blend->rgb_dst_factor; +      int srcA = 
brw->attribs.Blend->alpha_src_factor; +      int dstA = brw->attribs.Blend->alpha_dst_factor; + +      if (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX) { +	 srcRGB = dstRGB = PIPE_BLENDFACTOR_ONE; +      } + +      if (eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX) { +	 srcA = dstA = PIPE_BLENDFACTOR_ONE; +      } + +      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB); +      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB); +      cc.cc6.blend_function = brw_translate_blend_equation( eqRGB ); + +      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA); +      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA); +      cc.cc5.ia_blend_function = brw_translate_blend_equation( eqA ); + +      cc.cc3.blend_enable = 1; +      cc.cc3.ia_blend_enable = (srcA != srcRGB || +				dstA != dstRGB || +				eqA != eqRGB); +   } +    +   /* BRW_NEW_ALPHATEST +    */ +   if (brw->attribs.DepthStencil->alpha.enabled) { +      cc.cc3.alpha_test = 1; +      cc.cc3.alpha_test_func =  +	 brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func); + +      cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref_value); + +      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; +   } + +   if (brw->attribs.Blend->dither) { +      cc.cc5.dither_enable = 1; +      cc.cc6.y_dither_offset = 0; +      cc.cc6.x_dither_offset = 0; +   } + +   if (brw->attribs.DepthStencil->depth.enabled) { +      cc.cc2.depth_test = brw->attribs.DepthStencil->depth.enabled; +      cc.cc2.depth_test_function = brw_translate_compare_func(brw->attribs.DepthStencil->depth.func); +      cc.cc2.depth_write_enable = brw->attribs.DepthStencil->depth.writemask; +   } + +   /* CACHE_NEW_CC_VP */ +   cc.cc4.cc_viewport_state_offset =  brw->cc.vp_gs_offset >> 5; + +   if (BRW_DEBUG & DEBUG_STATS) +      cc.cc5.statistics_enable = 1; + +   brw->cc.state_gs_offset = brw_cache_data( &brw->cache[BRW_CC_UNIT], &cc ); +} + +const struct brw_tracked_state brw_cc_unit = { +   .dirty = { +      .brw = BRW_NEW_DEPTH_STENCIL | BRW_NEW_BLEND | BRW_NEW_ALPHA_TEST, +      .cache = CACHE_NEW_CC_VP +   }, +   .update = upload_cc_unit +}; + diff --git a/src/gallium/drivers/i965simple/brw_clip.c b/src/gallium/drivers/i965simple/brw_clip.c new file mode 100644 index 0000000000..268124cc53 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip.c @@ -0,0 +1,206 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
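brw_cc_vp and brw_cc_unit above follow the i965simple tracked-state pattern: each atom declares the dirty bits it depends on and an update callback that rebuilds its piece of hardware state and re-uploads it through the state cache. The sketch below shows the kind of loop that consumes such atoms; the brw->state.dirty fields and the atoms array are assumed shapes inferred from the .dirty/.update usage visible here, not quoted from this patch.

   /* Sketch only (assumed shape of the state-upload loop): re-run an
    * atom's update() whenever one of its declared dirty bits is set.
    */
   static void
   upload_dirty_atoms(struct brw_context *brw,
                      const struct brw_tracked_state **atoms,
                      int count)
   {
      int i;
      for (i = 0; i < count; i++) {
         const struct brw_tracked_state *atom = atoms[i];
         if ((brw->state.dirty.brw & atom->dirty.brw) ||
             (brw->state.dirty.cache & atom->dirty.cache))
            atom->update(brw);   /* e.g. upload_cc_unit() */
      }
   }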
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_clip.h" + +#define FRONT_UNFILLED_BIT  0x1 +#define BACK_UNFILLED_BIT   0x2 + + +static void compile_clip_prog( struct brw_context *brw, +			     struct brw_clip_prog_key *key ) +{ +   struct brw_clip_compile c; +   const unsigned *program; +   unsigned program_size; +   unsigned delta; +   unsigned i; + +   memset(&c, 0, sizeof(c)); + +   /* Begin the compilation: +    */ +   brw_init_compile(&c.func); + +   c.func.single_program_flow = 1; + +   c.key = *key; + + +   /* Need to locate the two positions present in vertex + header. +    * These are currently hardcoded: +    */ +   c.header_position_offset = ATTR_SIZE; + +   for (i = 0, delta = REG_SIZE; i < PIPE_MAX_SHADER_OUTPUTS; i++) +      if (c.key.attrs & (1<<i)) { +	 c.offset[i] = delta; +	 delta += ATTR_SIZE; +      } + +   c.nr_attrs = brw_count_bits(c.key.attrs); +   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */ +   c.nr_bytes = c.nr_regs * REG_SIZE; + +   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ + +   /* For some reason the thread is spawned with only 4 channels +    * unmasked. +    */ +   brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + +   /* Would ideally have the option of producing a program which could +    * do all three: +    */ +   switch (key->primitive) { +   case PIPE_PRIM_TRIANGLES: +#if 0 +      if (key->do_unfilled) +	 brw_emit_unfilled_clip( &c ); +      else +#endif +	 brw_emit_tri_clip( &c ); +      break; +   case PIPE_PRIM_LINES: +      brw_emit_line_clip( &c ); +      break; +   case PIPE_PRIM_POINTS: +      brw_emit_point_clip( &c ); +      break; +   default: +      assert(0); +      return; +   } + + + +   /* get the program +    */ +   program = brw_get_program(&c.func, &program_size); + +   /* Upload +    */ +   brw->clip.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_CLIP_PROG], +						&c.key, +						sizeof(c.key), +						program, +						program_size, +						&c.prog_data, +						&brw->clip.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, +			       struct brw_clip_prog_key *key ) +{ +   return brw_search_cache(&brw->cache[BRW_CLIP_PROG], +			   key, sizeof(*key), +			   &brw->clip.prog_data, +			   &brw->clip.prog_gs_offset); +} + + + + +/* Calculate interpolants for triangle and line rasterization. 
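/*
 * Aside -- illustrative sketch only, not part of this diff.  It restates the
 * vertex layout arithmetic in compile_clip_prog() above: a clip vertex is one
 * 32-byte header register followed by a 16-byte slot (ATTR_SIZE) per
 * attribute enabled in the key's attrs bitmask, so two attributes share a GRF
 * and nr_regs = (nr_attrs + 1) / 2 + 1.  The function name and the fixed
 * 32-bit mask width are assumptions of this sketch.
 */
static unsigned layout_clip_vertex(unsigned attrs_mask,
                                   unsigned offset[32],
                                   unsigned *nr_regs)
{
   unsigned delta = 32;            /* REG_SIZE: skip the vertex header */
   unsigned nr_attrs = 0;
   unsigned i;

   for (i = 0; i < 32; i++) {
      if (attrs_mask & (1u << i)) {
         offset[i] = delta;        /* byte offset of attribute i */
         delta += 16;              /* ATTR_SIZE: one vec4 per attribute */
         nr_attrs++;
      }
   }
   *nr_regs = (nr_attrs + 1) / 2 + 1;   /* packed attribute pairs + header */
   return nr_attrs;
}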
+ */ +static void upload_clip_prog(struct brw_context *brw) +{ +   struct brw_clip_prog_key key; + +   memset(&key, 0, sizeof(key)); + +   /* Populate the key: +    */ +   /* BRW_NEW_REDUCED_PRIMITIVE */ +   key.primitive = brw->reduced_primitive; +   /* CACHE_NEW_VS_PROG */ +   key.attrs = brw->vs.prog_data->outputs_written; +   /* BRW_NEW_RASTER */ +   key.do_flat_shading = (brw->attribs.Raster->flatshade); +   /* BRW_NEW_CLIP */ +   key.nr_userclip = brw->attribs.Clip.nr; /* XXX */ + +#if 0 +   key.clip_mode = BRW_CLIPMODE_NORMAL; + +   if (key.primitive == PIPE_PRIM_TRIANGLES) { +      if (brw->attribs.Raster->cull_mode == PIPE_WINDING_BOTH) +	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL; +      else { +         if (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL || +             brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL) +            key.do_unfilled = 1; + +	 /* Most cases the fixed function units will handle.  Cases where +	  * one or more polygon faces are unfilled will require help: +	  */ +	 if (key.do_unfilled) { +	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED; + +	    if (brw->attribs.Raster->offset_cw || +                brw->attribs.Raster->offset_ccw) { +	       key.offset_units = brw->attribs.Raster->offset_units; +	       key.offset_factor = brw->attribs.Raster->offset_scale; +	    } +            key.fill_ccw = brw->attribs.Raster->fill_ccw; +            key.fill_cw = brw->attribs.Raster->fill_cw; +            key.offset_ccw = brw->attribs.Raster->offset_ccw; +            key.offset_cw = brw->attribs.Raster->offset_cw; +            if (brw->attribs.Raster->light_twoside && +                key.fill_cw != CLIP_CULL) +               key.copy_bfc_cw = 1; +	 } +      } +   } +#else +   key.clip_mode = BRW_CLIPMODE_ACCEPT_ALL; +#endif + +   if (!search_cache(brw, &key)) +      compile_clip_prog( brw, &key ); +} + +const struct brw_tracked_state brw_clip_prog = { +   .dirty = { +      .brw   = (BRW_NEW_RASTERIZER | +		BRW_NEW_CLIP | +		BRW_NEW_REDUCED_PRIMITIVE), +      .cache = CACHE_NEW_VS_PROG +   }, +   .update = upload_clip_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_clip.h b/src/gallium/drivers/i965simple/brw_clip.h new file mode 100644 index 0000000000..d70fc094ff --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip.h @@ -0,0 +1,170 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#ifndef BRW_CLIP_H +#define BRW_CLIP_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_VERTS (3+6+6)	 + +/* Note that if unfilled primitives are being emitted, we have to fix + * up polygon offset and flatshading at this point: + */ +struct brw_clip_prog_key { +   unsigned attrs:32;		 +   unsigned primitive:4; +   unsigned nr_userclip:3; +   unsigned do_flat_shading:1; +   unsigned do_unfilled:1; +   unsigned fill_cw:2;		/* includes cull information */ +   unsigned fill_ccw:2;		/* includes cull information */ +   unsigned offset_cw:1; +   unsigned offset_ccw:1; +   unsigned pad0:17; + +   unsigned copy_bfc_cw:1; +   unsigned copy_bfc_ccw:1; +   unsigned clip_mode:3; +   unsigned pad1:27; +    +   float offset_factor; +   float offset_units; +}; + + +#define CLIP_LINE   0 +#define CLIP_POINT  1 +#define CLIP_FILL   2 +#define CLIP_CULL   3 + + +#define PRIM_MASK  (0x1f) + +struct brw_clip_compile { +   struct brw_compile func; +   struct brw_clip_prog_key key; +   struct brw_clip_prog_data prog_data; +    +   struct { +      struct brw_reg R0; +      struct brw_reg vertex[MAX_VERTS]; + +      struct brw_reg t; +      struct brw_reg t0, t1; +      struct brw_reg dp0, dp1; + +      struct brw_reg dpPrev; +      struct brw_reg dp; +      struct brw_reg loopcount; +      struct brw_reg nr_verts; +      struct brw_reg planemask; + +      struct brw_reg inlist; +      struct brw_reg outlist; +      struct brw_reg freelist; + +      struct brw_reg dir; +      struct brw_reg tmp0, tmp1; +      struct brw_reg offset; +       +      struct brw_reg fixed_planes; +      struct brw_reg plane_equation; +   } reg; + +   /* 3 different ways of expressing vertex size: +    */ +   unsigned nr_attrs; +   unsigned nr_regs; +   unsigned nr_bytes; + +   unsigned first_tmp; +   unsigned last_tmp; + +   boolean need_direction; + +   unsigned last_mrf; + +   unsigned header_position_offset; +   unsigned offset[PIPE_MAX_ATTRIBS]; +}; + +#define ATTR_SIZE  (4*4) + +/* Points are only culled, so no need for a clip routine, however it + * works out easier to have a dummy one. 
+ */ +void brw_emit_unfilled_clip( struct brw_clip_compile *c ); +void brw_emit_tri_clip( struct brw_clip_compile *c ); +void brw_emit_line_clip( struct brw_clip_compile *c ); +void brw_emit_point_clip( struct brw_clip_compile *c ); + +/* brw_clip_tri.c, for use by the unfilled clip routine: + */ +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ); +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ); +void brw_clip_tri( struct brw_clip_compile *c ); +void brw_clip_tri_emit_polygon( struct brw_clip_compile *c ); +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,  +			      unsigned nr_verts ); + + +/* Utils: + */ + +void brw_clip_interp_vertex( struct brw_clip_compile *c, +			     struct brw_indirect dest_ptr, +			     struct brw_indirect v0_ptr, /* from */ +			     struct brw_indirect v1_ptr, /* to */ +			     struct brw_reg t0, +			     boolean force_edgeflag ); + +void brw_clip_init_planes( struct brw_clip_compile *c ); + +void brw_clip_emit_vue(struct brw_clip_compile *c,  +		       struct brw_indirect vert, +		       boolean allocate, +		       boolean eot, +		       unsigned header); + +void brw_clip_kill_thread(struct brw_clip_compile *c); + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ); +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ); + +void brw_clip_copy_colors( struct brw_clip_compile *c, +			   unsigned to, unsigned from ); + +void brw_clip_init_clipmask( struct brw_clip_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_clip_line.c b/src/gallium/drivers/i965simple/brw_clip_line.c new file mode 100644 index 0000000000..75d9e5fcda --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_line.c @@ -0,0 +1,245 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
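/*
 * Aside -- illustrative sketch only, not part of this diff.
 * clip_and_emit_line() below follows the parametric scheme laid out in the
 * comment above it (part of the emitted sequence is still stubbed out with
 * #warning "disabled").  The same loop in plain C for reference: accumulate
 * t0 (clipping from the v0 end) and t1 (clipping from the v1 end) over all
 * active planes, and reject the line once t0 + t1 >= 1.  The float[4]
 * layout and helper names are assumptions of this sketch.
 */
static float dot4(const float v[4], const float p[4])
{
   return v[0]*p[0] + v[1]*p[1] + v[2]*p[2] + v[3]*p[3];
}

static int clip_line_ref(const float v0[4], const float v1[4],
                         const float planes[][4], unsigned planemask,
                         float *t0, float *t1)
{
   unsigned p;

   *t0 = *t1 = 0.0f;
   for (p = 0; planemask; p++, planemask >>= 1) {
      if (planemask & 1) {
         float dp0 = dot4(v0, planes[p]);
         float dp1 = dot4(v1, planes[p]);

         if (dp1 < 0.0f) {
            float t = dp1 / (dp1 - dp0);
            if (t > *t1) *t1 = t;          /* move the v1 endpoint in */
         }
         else {
            float t = dp0 / (dp0 - dp1);
            if (t > *t0) *t0 = t;          /* move the v0 endpoint in */
         }

         if (*t0 + *t1 >= 1.0f)
            return 0;                      /* whole line clipped away */
      }
   }
   return 1;   /* caller interpolates the two new endpoints with t0, t1 */
}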
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) +{ +   unsigned i = 0,j; + +   /* Register usage is static, precompute here: +    */ +   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + +   if (c->key.nr_userclip) { +      c->reg.fixed_planes = brw_vec4_grf(i, 0); +      i += (6 + c->key.nr_userclip + 1) / 2; + +      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; +   } +   else +      c->prog_data.curb_read_length = 0; + + +   /* Payload vertices plus space for more generated vertices: +    */ +   for (j = 0; j < 4; j++) { +      c->reg.vertex[j] = brw_vec4_grf(i, 0); +      i += c->nr_regs; +   } + +   c->reg.t           = brw_vec1_grf(i, 0); +   c->reg.t0          = brw_vec1_grf(i, 1); +   c->reg.t1          = brw_vec1_grf(i, 2); +   c->reg.planemask   = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); +   c->reg.plane_equation = brw_vec4_grf(i, 4); +   i++; + +   c->reg.dp0         = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ +   c->reg.dp1         = brw_vec1_grf(i, 4); +   i++; + +   if (!c->key.nr_userclip) { +      c->reg.fixed_planes = brw_vec8_grf(i, 0); +      i++; +   } + + +   c->first_tmp = i; +   c->last_tmp = i; + +   c->prog_data.urb_read_length = c->nr_regs; /* ? */ +   c->prog_data.total_grf = i; +} + + + +/* Line clipping, more or less following the following algorithm: + * + *  for (p=0;p<MAX_PLANES;p++) { + *     if (clipmask & (1 << p)) { + *        float dp0 = DOTPROD( vtx0, plane[p] ); + *        float dp1 = DOTPROD( vtx1, plane[p] ); + * + *        if (IS_NEGATIVE(dp1)) { + *           float t = dp1 / (dp1 - dp0); + *           if (t > t1) t1 = t; + *        } else { + *           float t = dp0 / (dp0 - dp1); + *           if (t > t0) t0 = t; + *        } + * + *        if (t0 + t1 >= 1.0) + *           return; + *     } + *  } + * + *  interp( ctx, newvtx0, vtx0, vtx1, t0 ); + *  interp( ctx, newvtx1, vtx1, vtx0, t1 ); + * + */ +static void clip_and_emit_line( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_indirect vtx0     = brw_indirect(0, 0); +   struct brw_indirect vtx1      = brw_indirect(1, 0); +   struct brw_indirect newvtx0   = brw_indirect(2, 0); +   struct brw_indirect newvtx1   = brw_indirect(3, 0); +   struct brw_indirect plane_ptr = brw_indirect(4, 0); +   struct brw_instruction *plane_loop; +   struct brw_instruction *plane_active; +   struct brw_instruction *is_negative; +   struct brw_instruction *is_neg2; +   struct brw_instruction *not_culled; +   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD); + +   brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0])); +   brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1])); +   brw_MOV(p, get_addr_reg(newvtx0),   brw_address(c->reg.vertex[2])); +   brw_MOV(p, get_addr_reg(newvtx1),   brw_address(c->reg.vertex[3])); +   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + +   /* Note: init t0, t1 together: +    */ +   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0)); + +   brw_clip_init_planes(c); +   brw_clip_init_clipmask(c); + +   /* -ve rhw workaround */ +   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +   brw_AND(p, brw_null_reg(), 
get_element_ud(c->reg.R0, 2), +	   brw_imm_ud(1<<20)); +   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f)); +   brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +   plane_loop = brw_DO(p, BRW_EXECUTE_1); +   { +      /* if (planemask & 1) +       */ +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1)); + +      plane_active = brw_IF(p, BRW_EXECUTE_1); +      { +	 if (c->key.nr_userclip) +	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); +	 else +	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + +#if 0 +	 /* dp = DP4(vtx->position, plane) +	  */ +	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + +	 /* if (IS_NEGATIVE(dp1)) +	  */ +	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L); +	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); +#else +         #warning "disabled" +#endif +	 is_negative = brw_IF(p, BRW_EXECUTE_1); +	 { +	    brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0)); +	    brw_math_invert(p, c->reg.t, c->reg.t); +	    brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1); + +	    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 ); +	    brw_MOV(p, c->reg.t1, c->reg.t); +	    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +	 } +	 is_negative = brw_ELSE(p, is_negative); +	 { +	    /* Coming back in.  We know that both cannot be negative +	     * because the line would have been culled in that case. +	     */ + +	    /* If both are positive, do nothing */ +             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); +             is_neg2 = brw_IF(p, BRW_EXECUTE_1); +             { +		brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1)); +		brw_math_invert(p, c->reg.t, c->reg.t); +		brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0); + +		brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 ); +		brw_MOV(p, c->reg.t0, c->reg.t); +		brw_set_predicate_control(p, BRW_PREDICATE_NONE); +	     } +	     brw_ENDIF(p, is_neg2); +	 } +	 brw_ENDIF(p, is_negative); +      } +      brw_ENDIF(p, plane_active); + +      /* plane_ptr++; +       */ +      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + +      /* while (planemask>>=1) != 0 +       */ +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); +   } +   brw_WHILE(p, plane_loop); + +   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1); +   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0)); +   not_culled = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE); +      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE); + +      brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); +      brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); +   } +   brw_ENDIF(p, not_culled); +   brw_clip_kill_thread(c); +} + + + +void brw_emit_line_clip( struct brw_clip_compile *c ) +{ +   brw_clip_line_alloc_regs(c); + +   if (c->key.do_flat_shading) +      brw_clip_copy_colors(c, 0, 1); + +   clip_and_emit_line(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_point.c b/src/gallium/drivers/i965simple/brw_clip_point.c new file mode 100644 index 0000000000..6fce7210d1 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_point.c @@ 
-0,0 +1,47 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + +/* Point clipping, nothing to do? + */ +void brw_emit_point_clip( struct brw_clip_compile *c ) +{ +   /* Send an empty message to kill the thread: +    */ +   brw_clip_tri_alloc_regs(c, 0); +   brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c new file mode 100644 index 0000000000..8e78dd51be --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_state.c @@ -0,0 +1,93 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
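/*
 * Aside -- illustrative sketch only, not part of this diff.
 * upload_clip_unit() below packs byte quantities into the coarser units the
 * hardware expects: the kernel start pointer is stored in 64-byte units
 * (hence the >> 6) and the GRF count in blocks of 16 registers, minus one.
 * Plain-C restatement; align() here stands for the round-up helper from
 * util/u_math.h.
 */
static unsigned pack_grf_reg_count(unsigned total_grf)
{
   return (total_grf + 15) / 16 - 1;   /* align(total_grf, 16) / 16 - 1 */
}

static unsigned pack_kernel_start_pointer(unsigned prog_gs_offset)
{
   return prog_gs_offset >> 6;         /* program offset in 64-byte units */
}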
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +static void upload_clip_unit( struct brw_context *brw ) +{ +   struct brw_clip_unit_state clip; + +   memset(&clip, 0, sizeof(clip)); + +   /* CACHE_NEW_CLIP_PROG */ +   clip.thread0.grf_reg_count = +      align(brw->clip.prog_data->total_grf, 16) / 16 - 1; +   clip.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6; +   clip.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length; +   clip.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length; +   clip.clip5.clip_mode = brw->clip.prog_data->clip_mode; + +   /* BRW_NEW_CURBE_OFFSETS */ +   clip.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2; + +   /* BRW_NEW_URB_FENCE */ +   clip.thread4.nr_urb_entries = brw->urb.nr_clip_entries;  +   clip.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; +   clip.thread4.max_threads = 1; /* 2 threads */ + +   if (BRW_DEBUG & DEBUG_STATS) +      clip.thread4.stats_enable = 1;  + +   /* CONSTANT */ +   clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; +   clip.thread1.single_program_flow = 1; +   clip.thread3.dispatch_grf_start_reg = 1; +   clip.thread3.urb_entry_read_offset = 0; +   clip.clip5.userclip_enable_flags = 0x7f; +   clip.clip5.userclip_must_clip = 1; +   clip.clip5.guard_band_enable = 0; +   clip.clip5.viewport_z_clip_enable = 1; +   clip.clip5.viewport_xy_clip_enable = 1; +   clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE; +   clip.clip5.api_mode = BRW_CLIP_API_OGL;    +   clip.clip6.clipper_viewport_state_ptr = 0; +   clip.viewport_xmin = -1; +   clip.viewport_xmax = 1; +   clip.viewport_ymin = -1; +   clip.viewport_ymax = 1; + +   brw->clip.state_gs_offset = brw_cache_data( &brw->cache[BRW_CLIP_UNIT], &clip ); +} + + +const struct brw_tracked_state brw_clip_unit = { +   .dirty = { +      .brw   = (BRW_NEW_CURBE_OFFSETS | +		BRW_NEW_URB_FENCE), +      .cache = CACHE_NEW_CLIP_PROG +   }, +   .update = upload_clip_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_clip_tri.c b/src/gallium/drivers/i965simple/brw_clip_tri.c new file mode 100644 index 0000000000..c5da7b825e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_tri.c @@ -0,0 +1,566 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
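/*
 * Aside -- illustrative sketch only, not part of this diff.  The body of
 * brw_clip_tri() below (currently compiled out under #if 0) is a GEN4
 * rendering of the Sutherland-Hodgman style loop used by mesa: clip the
 * vertex list against one plane at a time, carrying the previous vertex's
 * plane distance and emitting an interpolated vertex on every sign change.
 * Plain-C reference for a single plane; the vertex type, dot4() and lerp4()
 * helpers are assumptions of this sketch.
 */
struct clip_vtx { float pos[4]; };

static float dot4(const float v[4], const float p[4])
{
   return v[0]*p[0] + v[1]*p[1] + v[2]*p[2] + v[3]*p[3];
}

static void lerp4(float out[4], const float a[4], const float b[4], float t)
{
   int i;
   for (i = 0; i < 4; i++)
      out[i] = a[i] + t * (b[i] - a[i]);
}

static unsigned clip_poly_against_plane(struct clip_vtx *out,
                                        const struct clip_vtx *in,
                                        unsigned nr, const float plane[4])
{
   const struct clip_vtx *prev = &in[nr - 1];
   float dp_prev = dot4(prev->pos, plane);
   unsigned i, nr_out = 0;

   for (i = 0; i < nr; i++) {
      float dp = dot4(in[i].pos, plane);

      if (dp_prev >= 0.0f)                      /* prev is inside: keep it */
         out[nr_out++] = *prev;

      if ((dp >= 0.0f) != (dp_prev >= 0.0f)) {  /* edge crosses the plane */
         float t = dp_prev / (dp_prev - dp);
         lerp4(out[nr_out++].pos, prev->pos, in[i].pos, t);
      }

      prev = &in[i];
      dp_prev = dp;
   }
   return nr_out;                               /* fewer than 3: culled */
}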
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + +static struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ +   struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + +   if (++c->last_tmp > c->prog_data.total_grf) +      c->prog_data.total_grf = c->last_tmp; + +   return tmp; +} + +static void release_tmps( struct brw_clip_compile *c ) +{ +   c->last_tmp = c->first_tmp; +} + + +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, +			      unsigned nr_verts ) +{ +   unsigned i = 0,j; + +   /* Register usage is static, precompute here: +    */ +   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + +   if (c->key.nr_userclip) { +      c->reg.fixed_planes = brw_vec4_grf(i, 0); +      i += (6 + c->key.nr_userclip + 1) / 2; + +      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; +   } +   else +      c->prog_data.curb_read_length = 0; + + +   /* Payload vertices plus space for more generated vertices: +    */ +   for (j = 0; j < nr_verts; j++) { +      c->reg.vertex[j] = brw_vec4_grf(i, 0); +      i += c->nr_regs; +   } + +   if (c->nr_attrs & 1) { +      for (j = 0; j < 3; j++) { +	 unsigned delta = c->nr_attrs*16 + 32; +	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); +      } +   } + +   c->reg.t          = brw_vec1_grf(i, 0); +   c->reg.loopcount  = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_UD); +   c->reg.nr_verts   = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD); +   c->reg.planemask  = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); +   c->reg.plane_equation = brw_vec4_grf(i, 4); +   i++; + +   c->reg.dpPrev     = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ +   c->reg.dp         = brw_vec1_grf(i, 4); +   i++; + +   c->reg.inlist     = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); +   i++; + +   c->reg.outlist    = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); +   i++; + +   c->reg.freelist   = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); +   i++; + +   if (!c->key.nr_userclip) { +      c->reg.fixed_planes = brw_vec8_grf(i, 0); +      i++; +   } + +   if (c->key.do_unfilled) { +      c->reg.dir     = brw_vec4_grf(i, 0); +      c->reg.offset  = brw_vec4_grf(i, 4); +      i++; +      c->reg.tmp0    = brw_vec4_grf(i, 0); +      c->reg.tmp1    = brw_vec4_grf(i, 4); +      i++; +   } + +   c->first_tmp = i; +   c->last_tmp = i; + +   c->prog_data.urb_read_length = c->nr_regs; /* ? */ +   c->prog_data.total_grf = i; +} + + + +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ +   struct brw_instruction *is_rev; + +   /* Initial list of indices for incoming vertexes: +    */ +   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); +   brw_CMP(p, +	   vec1(brw_null_reg()), +	   BRW_CONDITIONAL_EQ, +	   tmp0, +	   brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + +   /* XXX: Is there an easier way to do this?  
Need to reverse every +    * second tristrip element:  Can ignore sometimes? +    */ +   is_rev = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[1]) ); +      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[0]) ); +      if (c->need_direction) +	 brw_MOV(p, c->reg.dir, brw_imm_f(-1)); +   } +   is_rev = brw_ELSE(p, is_rev); +   { +      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[0]) ); +      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[1]) ); +      if (c->need_direction) +	 brw_MOV(p, c->reg.dir, brw_imm_f(1)); +   } +   brw_ENDIF(p, is_rev); + +   brw_MOV(p, get_element(c->reg.inlist, 2),  brw_address(c->reg.vertex[2]) ); +   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0)); +   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3)); +} + + + +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *is_poly; +   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + +   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); +   brw_CMP(p, +	   vec1(brw_null_reg()), +	   BRW_CONDITIONAL_EQ, +	   tmp0, +	   brw_imm_ud(_3DPRIM_POLYGON)); + +   is_poly = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_clip_copy_colors(c, 1, 0); +      brw_clip_copy_colors(c, 2, 0); +   } +   is_poly = brw_ELSE(p, is_poly); +   { +      brw_clip_copy_colors(c, 0, 2); +      brw_clip_copy_colors(c, 1, 2); +   } +   brw_ENDIF(p, is_poly); +} + + + +/* Use mesa's clipping algorithms, translated to GEN4 assembly. + */ +void brw_clip_tri( struct brw_clip_compile *c ) +{ +#if 0 +   struct brw_compile *p = &c->func; +   struct brw_indirect vtx = brw_indirect(0, 0); +   struct brw_indirect vtxPrev = brw_indirect(1, 0); +   struct brw_indirect vtxOut = brw_indirect(2, 0); +   struct brw_indirect plane_ptr = brw_indirect(3, 0); +   struct brw_indirect inlist_ptr = brw_indirect(4, 0); +   struct brw_indirect outlist_ptr = brw_indirect(5, 0); +   struct brw_indirect freelist_ptr = brw_indirect(6, 0); +   struct brw_instruction *plane_loop; +   struct brw_instruction *plane_active; +   struct brw_instruction *vertex_loop; +   struct brw_instruction *next_test; +   struct brw_instruction *prev_test; + +   brw_MOV(p, get_addr_reg(vtxPrev),     brw_address(c->reg.vertex[2]) ); +   brw_MOV(p, get_addr_reg(plane_ptr),   brw_clip_plane0_address(c)); +   brw_MOV(p, get_addr_reg(inlist_ptr),  brw_address(c->reg.inlist)); +   brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + +   brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) ); + +   plane_loop = brw_DO(p, BRW_EXECUTE_1); +   { +      /* if (planemask & 1) +       */ +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1)); + +      plane_active = brw_IF(p, BRW_EXECUTE_1); +      { +	 /* vtxOut = freelist_ptr++ +	  */ +	 brw_MOV(p, get_addr_reg(vtxOut),       get_addr_reg(freelist_ptr) ); +	 brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE)); + +	 if (c->key.nr_userclip) +	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); +	 else +	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + +	 brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); +	 brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0)); + +	 vertex_loop = brw_DO(p, BRW_EXECUTE_1); +	 { +	    /* vtx = *input_ptr; +	     */ +	    
brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0)); + +	    /* IS_NEGATIVE(prev) */ +	    brw_set_conditionalmod(p, BRW_CONDITIONAL_L); +	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); +	    prev_test = brw_IF(p, BRW_EXECUTE_1); +	    { +	       /* IS_POSITIVE(next) +		*/ +	       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE); +	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); +	       next_test = brw_IF(p, BRW_EXECUTE_1); +	       { + +		  /* Coming back in. +		   */ +		  brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp)); +		  brw_math_invert(p, c->reg.t, c->reg.t); +		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev); + +		  /* If (vtxOut == 0) vtxOut = vtxPrev +		   */ +		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); +		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) ); +		  brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +		  brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, FALSE); + +		  /* *outlist_ptr++ = vtxOut; +		   * nr_verts++; +		   * vtxOut = 0; +		   */ +		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); +		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); +		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); +		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); +	       } +	       brw_ENDIF(p, next_test); + +	    } +	    prev_test = brw_ELSE(p, prev_test); +	    { +	       /* *outlist_ptr++ = vtxPrev; +		* nr_verts++; +		*/ +	       brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev)); +	       brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); +	       brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + +	       /* IS_NEGATIVE(next) +		*/ +	       brw_set_conditionalmod(p, BRW_CONDITIONAL_L); +	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); +	       next_test = brw_IF(p, BRW_EXECUTE_1); +	       { +		  /* Going out of bounds.  Avoid division by zero as we +		   * know dp != dpPrev from DIFFERENT_SIGNS, above. 
+		   */ +		  brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev)); +		  brw_math_invert(p, c->reg.t, c->reg.t); +		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp); + +		  /* If (vtxOut == 0) vtxOut = vtx +		   */ +		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); +		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) ); +		  brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +		  brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, TRUE); + +		  /* *outlist_ptr++ = vtxOut; +		   * nr_verts++; +		   * vtxOut = 0; +		   */ +		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); +		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); +		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); +		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); +	       } +	       brw_ENDIF(p, next_test); +	    } +	    brw_ENDIF(p, prev_test); + +	    /* vtxPrev = vtx; +	     * inlist_ptr++; +	     */ +	    brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx)); +	    brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short))); + +	    /* while (--loopcount != 0) +	     */ +	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +	    brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); +	 } +	 brw_WHILE(p, vertex_loop); + +	 /* vtxPrev = *(outlist_ptr-1)  OR: outlist[nr_verts-1] +	  * inlist = outlist +	  * inlist_ptr = &inlist[0] +	  * outlist_ptr = &outlist[0] +	  */ +	 brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2)); +	 brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0)); +	 brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0)); +	 brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); +	 brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); +      } +      brw_ENDIF(p, plane_active); + +      /* plane_ptr++; +       */ +      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + +      /* nr_verts >= 3 +       */ +      brw_CMP(p, +	      vec1(brw_null_reg()), +	      BRW_CONDITIONAL_GE, +	      c->reg.nr_verts, +	      brw_imm_ud(3)); + +      /* && (planemask>>=1) != 0 +       */ +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); +   } +   brw_WHILE(p, plane_loop); +#else +         #warning "disabled" +#endif +} + + + +void brw_clip_tri_emit_polygon(struct brw_clip_compile *c) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *loop, *if_insn; + +   /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--) +    */ +   brw_set_conditionalmod(p, BRW_CONDITIONAL_G); +   brw_ADD(p, +	   c->reg.loopcount, +	   c->reg.nr_verts, +	   brw_imm_d(-2)); + +   if_insn = brw_IF(p, BRW_EXECUTE_1); +   { +      struct brw_indirect v0 = brw_indirect(0, 0); +      struct brw_indirect vptr = brw_indirect(1, 0); + +      brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist)); +      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + +      brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START)); + +      brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); +      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + +      loop = brw_DO(p, BRW_EXECUTE_1); +      { +	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2)); + +	 brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); +	 brw_MOV(p, get_addr_reg(v0), 
deref_1uw(vptr, 0)); + +	 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); +      } +      brw_WHILE(p, loop); + +      brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END)); +   } +   brw_ENDIF(p, if_insn); +} + +static void do_clip_tri( struct brw_clip_compile *c ) +{ +   brw_clip_init_planes(c); + +   brw_clip_tri(c); +} + + +static void maybe_do_clip_tri( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *do_clip; + +   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); +   do_clip = brw_IF(p, BRW_EXECUTE_1); +   { +      do_clip_tri(c); +   } +   brw_ENDIF(p, do_clip); +} + +static void brw_clip_test( struct brw_clip_compile *c ) +{ +#if 0 +    struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); +    struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); +    struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); +    struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + +    struct brw_reg v0 = get_tmp(c); +    struct brw_reg v1 = get_tmp(c); +    struct brw_reg v2 = get_tmp(c); + +    struct brw_indirect vt0 = brw_indirect(0, 0); +    struct brw_indirect vt1 = brw_indirect(1, 0); +    struct brw_indirect vt2 = brw_indirect(2, 0); + +    struct brw_compile *p = &c->func; + +    brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0])); +    brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1])); +    brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2])); +    brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS])); +    brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS])); +    brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS])); + +    /* test nearz, xmin, ymin plane */ +    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_XOR(p, t, t1, t2); +    brw_XOR(p, t1, t2, t3); +    brw_OR(p, t, t, t1); + +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 0), brw_imm_ud(0)); +    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 1), brw_imm_ud(0)); +    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 2), brw_imm_ud(0)); +    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +    /* test farz, xmax, ymax plane */ +    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3)); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +    brw_XOR(p, t, t1, t2); +    brw_XOR(p, t1, t2, t3); +    brw_OR(p, t, t, t1); + +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 0), brw_imm_ud(0)); +    brw_OR(p, 
c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 1), brw_imm_ud(0)); +    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); +    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, +	    get_element(t, 2), brw_imm_ud(0)); +    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0))); +    brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +    release_tmps(c); +#else +         #warning "disabled" +#endif +} + + +void brw_emit_tri_clip( struct brw_clip_compile *c ) +{ +   struct brw_instruction *neg_rhw; +   struct brw_compile *p = &c->func; +   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); +   brw_clip_tri_init_vertices(c); +   brw_clip_init_clipmask(c); + +   /* if -ve rhw workaround bit is set, +      do cliptest */ +   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), +	   brw_imm_ud(1<<20)); +   neg_rhw = brw_IF(p, BRW_EXECUTE_1); +   { +       brw_clip_test(c); +   } +   brw_ENDIF(p, neg_rhw); + +   /* Can't push into do_clip_tri because with polygon (or quad) +    * flatshading, need to apply the flatshade here because we don't +    * respect the PV when converting to trifan for emit: +    */ +   if (c->key.do_flat_shading) +      brw_clip_tri_flat_shade(c); + +   if (c->key.clip_mode == BRW_CLIPMODE_NORMAL) +      do_clip_tri(c); +   else +      maybe_do_clip_tri(c); + +   brw_clip_tri_emit_polygon(c); + +   /* Send an empty message to kill the thread: +    */ +   brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_unfilled.c b/src/gallium/drivers/i965simple/brw_clip_unfilled.c new file mode 100644 index 0000000000..b774a76dd6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_unfilled.c @@ -0,0 +1,477 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
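/*
 * Aside -- illustrative sketch only, not part of this diff.
 * compute_tri_direction() and cull_direction() below decide facing from the
 * sign of the z component of (v0 - v2) x (v1 - v2), flipped by the +/-1
 * direction loaded for reverse tristrips in brw_clip_tri_init_vertices().
 * Plain-C equivalent of that test; the function name is hypothetical.
 */
static float tri_facing_z(const float v0[4], const float v1[4],
                          const float v2[4], float dir /* +1 or -1 */)
{
   float ex = v0[0] - v2[0], ey = v0[1] - v2[1];
   float fx = v1[0] - v2[0], fy = v1[1] - v2[1];

   /* z of the cross product e x f: one sign per winding.  cull_direction()
    * kills the thread when this sign matches the culled winding. */
   return dir * (ex * fy - ey * fx);
}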
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +/* This is performed against the original triangles, so no indirection + * required: +BZZZT! + */ +static void compute_tri_direction( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg e = c->reg.tmp0; +   struct brw_reg f = c->reg.tmp1; +   struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]);  +   struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]);  +   struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]);  + + +   /* Calculate the vectors of two edges of the triangle: +    */ +   brw_ADD(p, e, v0, negate(v2));  +   brw_ADD(p, f, v1, negate(v2));  + +   /* Take their crossproduct: +    */ +   brw_set_access_mode(p, BRW_ALIGN_16); +   brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3),  brw_swizzle(f,2,0,1,3)); +   brw_MAC(p, vec4(e),  negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); +   brw_set_access_mode(p, BRW_ALIGN_1); + +   brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); +} + + +static void cull_direction( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *ccw; +   unsigned conditional; + +   assert (!(c->key.fill_ccw == CLIP_CULL && +	     c->key.fill_cw == CLIP_CULL)); + +   if (c->key.fill_ccw == CLIP_CULL) +      conditional = BRW_CONDITIONAL_GE; +   else +      conditional = BRW_CONDITIONAL_L; + +   brw_CMP(p, +	   vec1(brw_null_reg()), +	   conditional, +	   get_element(c->reg.dir, 2), +	   brw_imm_f(0)); +    +   ccw = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_clip_kill_thread(c); +   } +   brw_ENDIF(p, ccw); +} + + + +static void copy_bfc( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *ccw; +   unsigned conditional; + +   /* Do we have any colors to copy?  +    */ +   if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) && +       !(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])) +      return; + +   /* In some wierd degnerate cases we can end up testing the +    * direction twice, once for culling and once for bfc copying.  Oh +    * well, that's what you get for setting wierd GL state. 
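/*
 * Aside -- illustrative sketch only, not part of this diff.  compute_offset()
 * a little further below evaluates the polygon offset per triangle from the
 * direction vector: the larger of the depth slopes |dz/dx| and |dz/dy|
 * scaled by offset_factor, plus offset_units.  Plain-C equivalent; the
 * function name is hypothetical.
 */
#include <math.h>

static float polygon_offset(const float dir[4], float factor, float units)
{
   float iz = 1.0f / dir[2];
   float ac = fabsf(dir[0] * iz);    /* |dz/dx| */
   float bc = fabsf(dir[1] * iz);    /* |dz/dy| */
   float m  = ac > bc ? ac : bc;     /* maximum depth slope */

   return m * factor + units;
}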
+    */ +   if (c->key.copy_bfc_ccw) +      conditional = BRW_CONDITIONAL_GE; +   else +      conditional = BRW_CONDITIONAL_L; + +   brw_CMP(p, +	   vec1(brw_null_reg()), +	   conditional, +	   get_element(c->reg.dir, 2), +	   brw_imm_f(0)); +    +   ccw = brw_IF(p, BRW_EXECUTE_1); +   { +      unsigned i; + +      for (i = 0; i < 3; i++) { +	 if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) +	    brw_MOV(p,  +		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]), +		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0])); + +	 if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]) +	    brw_MOV(p,  +		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]), +		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1])); +      } +   } +   brw_ENDIF(p, ccw); +} + + + + +/* +  float iz	= 1.0 / dir.z; +  float ac	= dir.x * iz; +  float bc	= dir.y * iz; +  offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE; +  offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor; +  offset *= MRD; +*/ +static void compute_offset( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg off = c->reg.offset; +   struct brw_reg dir = c->reg.dir; +    +   brw_math_invert(p, get_element(off, 2), get_element(dir, 2)); +   brw_MUL(p, vec2(off), dir, get_element(off, 2)); + +   brw_CMP(p,  +	   vec1(brw_null_reg()),  +	   BRW_CONDITIONAL_GE, +	   brw_abs(get_element(off, 0)),  +	   brw_abs(get_element(off, 1))); + +   brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1))); +   brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +   brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor)); +   brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units)); +} + + +static void merge_edgeflags( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *is_poly; +   struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0); + +   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));  +   brw_CMP(p,  +	   vec1(brw_null_reg()),  +	   BRW_CONDITIONAL_EQ,  +	   tmp0, +	   brw_imm_ud(_3DPRIM_POLYGON)); + +   /* Get away with using reg.vertex because we know that this is not +    * a _3DPRIM_TRISTRIP_REVERSE: +    */ +   is_poly = brw_IF(p, BRW_EXECUTE_1); +   {    +      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); +      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8)); +      brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); +      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9)); +      brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); +   } +   brw_ENDIF(p, is_poly); +} + + + +static void apply_one_offset( struct brw_clip_compile *c, +			  struct brw_indirect vert ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg pos = deref_4f(vert, c->offset[VERT_RESULT_HPOS]); +   struct brw_reg z = get_element(pos, 2); + +   brw_ADD(p, z, z, vec1(c->reg.offset)); +} + + + +/*********************************************************************** + * Output clipped polygon as an unfilled primitive: + */ +static void emit_lines(struct brw_clip_compile *c, +		       boolean do_offset) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *loop; 
+   struct brw_instruction *draw_edge; +   struct brw_indirect v0 = brw_indirect(0, 0); +   struct brw_indirect v1 = brw_indirect(1, 0); +   struct brw_indirect v0ptr = brw_indirect(2, 0); +   struct brw_indirect v1ptr = brw_indirect(3, 0); + +   /* Need a seperate loop for offset: +    */ +   if (do_offset) { +      brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); +      brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + +      loop = brw_DO(p, BRW_EXECUTE_1); +      { +	 brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); +	 brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); +	     +	 apply_one_offset(c, v0); +	     +	 brw_set_conditionalmod(p, BRW_CONDITIONAL_G); +	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); +      } +      brw_WHILE(p, loop); +   } + +   /* v1ptr = &inlist[nr_verts] +    * *v1ptr = v0 +    */ +   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); +   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); +   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); +   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); +   brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0)); + +   loop = brw_DO(p, BRW_EXECUTE_1); +   { +      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); +      brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2)); +      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + +      /* draw edge if edgeflag != 0 */ +      brw_CMP(p,  +	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,  +	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]), +	      brw_imm_f(0)); +      draw_edge = brw_IF(p, BRW_EXECUTE_1); +      { +	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); +	 brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); +      } +      brw_ENDIF(p, draw_edge); + +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); +   } +   brw_WHILE(p, loop); +} + + + +static void emit_points(struct brw_clip_compile *c, +			boolean do_offset ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *loop; +   struct brw_instruction *draw_point; + +   struct brw_indirect v0 = brw_indirect(0, 0); +   struct brw_indirect v0ptr = brw_indirect(2, 0); + +   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); +   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + +   loop = brw_DO(p, BRW_EXECUTE_1); +   { +      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); +      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + +      /* draw if edgeflag != 0  +       */ +      brw_CMP(p,  +	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,  +	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]), +	      brw_imm_f(0)); +      draw_point = brw_IF(p, BRW_EXECUTE_1); +      { +	 if (do_offset) +	    apply_one_offset(c, v0); + +	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END); +      } +      brw_ENDIF(p, draw_point); + +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); +   } +   brw_WHILE(p, loop); +} + + + + + + + +static void emit_primitives( struct brw_clip_compile *c, +			     unsigned mode,  +			     boolean do_offset ) +{ +   switch (mode) { +   case CLIP_FILL: +      brw_clip_tri_emit_polygon(c); +      break; + +   case CLIP_LINE: +      emit_lines(c, do_offset); +    
  break; + +   case CLIP_POINT: +      emit_points(c, do_offset); +      break; + +   case CLIP_CULL: +      assert(0); +      break; +   } +}  + + + +static void emit_unfilled_primitives( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *ccw; + +   /* Direction culling has already been done. +    */ +   if (c->key.fill_ccw != c->key.fill_cw && +       c->key.fill_ccw != CLIP_CULL && +       c->key.fill_cw != CLIP_CULL) +   { +      brw_CMP(p, +	      vec1(brw_null_reg()), +	      BRW_CONDITIONAL_GE, +	      get_element(c->reg.dir, 2), +	      brw_imm_f(0)); +    +      ccw = brw_IF(p, BRW_EXECUTE_1); +      { +	 emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); +      } +      ccw = brw_ELSE(p, ccw); +      { +	 emit_primitives(c, c->key.fill_cw, c->key.offset_cw); +      } +      brw_ENDIF(p, ccw); +   } +   else if (c->key.fill_cw != CLIP_CULL) { +      emit_primitives(c, c->key.fill_cw, c->key.offset_cw); +   } +   else if (c->key.fill_ccw != CLIP_CULL) {  +      emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); +   } +} + + + + +static void check_nr_verts( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *if_insn; + +   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));       +   if_insn = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_clip_kill_thread(c); +   } +   brw_ENDIF(p, if_insn); +} + + +void brw_emit_unfilled_clip( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *do_clip; +    + +   c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) || +			(c->key.fill_ccw != c->key.fill_cw) || +			c->key.fill_ccw == CLIP_CULL || +			c->key.fill_cw == CLIP_CULL || +			c->key.copy_bfc_cw || +			c->key.copy_bfc_ccw); + +   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); +   brw_clip_tri_init_vertices(c); + +   assert(c->offset[VERT_RESULT_EDGE]); + +   if (c->key.fill_ccw == CLIP_CULL && +       c->key.fill_cw == CLIP_CULL) { +      brw_clip_kill_thread(c); +      return; +   } + +   merge_edgeflags(c); + +   /* Need to use the inlist indirection here:  +    */ +   if (c->need_direction)  +      compute_tri_direction(c); +    +   if (c->key.fill_ccw == CLIP_CULL || +       c->key.fill_cw == CLIP_CULL) +      cull_direction(c); + +   if (c->key.offset_ccw || +       c->key.offset_cw) +      compute_offset(c); + +   if (c->key.copy_bfc_ccw || +       c->key.copy_bfc_cw) +      copy_bfc(c); + +   /* Need to do this whether we clip or not: +    */ +   if (c->key.do_flat_shading) +      brw_clip_tri_flat_shade(c); +    +   brw_clip_init_clipmask(c); +   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); +   do_clip = brw_IF(p, BRW_EXECUTE_1); +   { +      brw_clip_init_planes(c); +      brw_clip_tri(c); +      check_nr_verts(c); +   } +   brw_ENDIF(p, do_clip); +    +   emit_unfilled_primitives(c); +   brw_clip_kill_thread(c); +} + + + diff --git a/src/gallium/drivers/i965simple/brw_clip_util.c b/src/gallium/drivers/i965simple/brw_clip_util.c new file mode 100644 index 0000000000..6d58ceafff --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_util.c @@ -0,0 +1,351 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
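/*
 * Aside -- illustrative sketch only, not part of this diff.
 * brw_clip_interp_vertex() below (its body is currently under #if 0) copies
 * the vertex header, then computes new = a0 + t * (a1 - a0) for each 16-byte
 * attribute slot, with the edge flag either forced to 1 or copied rather
 * than interpolated, and finally re-projects the new position via
 * brw_clip_project_vertex().  Plain-C equivalent of the interpolation step
 * only; names and parameters are assumptions of this sketch.
 */
static void interp_clip_attrs(float (*dst)[4],
                              const float (*v0)[4], const float (*v1)[4],
                              float t, unsigned nr_attrs,
                              int edge_slot, int force_edgeflag)
{
   unsigned i, j;

   for (i = 0; i < nr_attrs; i++) {
      if ((int)i == edge_slot) {
         /* Edge flags are not blended: force to 1 on a new clip edge, or
          * inherit from the 'from' vertex. */
         dst[i][0] = force_edgeflag ? 1.0f : v0[i][0];
      }
      else {
         for (j = 0; j < 4; j++)
            dst[i][j] = v0[i][j] + t * (v1[i][j] - v0[i][j]);
      }
   }
}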
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + + + +static struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ +   struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + +   if (++c->last_tmp > c->prog_data.total_grf) +      c->prog_data.total_grf = c->last_tmp; + +   return tmp; +} + +static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp ) +{ +   if (tmp.nr == c->last_tmp-1) +      c->last_tmp--; +} + + +static struct brw_reg make_plane_ud(unsigned x, unsigned y, unsigned z, unsigned w) +{ +   return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x); +} + + +void brw_clip_init_planes( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; + +   if (!c->key.nr_userclip) { +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0,    0, 0xff, 1)); +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0,    0,    1, 1)); +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff,    0, 1)); +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0,    1,    0, 1)); +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff,  0,    0, 1)); +      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1,    0,    0, 1)); +   } +} + + + +#define W 3 + +/* Project 'pos' to screen space (or back again), overwrite with results: + */ +static void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) +{ +   struct brw_compile *p = &c->func; + +   /* calc rhw +    */ +   brw_math_invert(p, get_element(pos, W), get_element(pos, W)); + +   /* value.xyz *= value.rhw +    */ +   brw_set_access_mode(p, BRW_ALIGN_16); +   brw_MUL(p, brw_writemask(pos, TGSI_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); +   brw_set_access_mode(p, BRW_ALIGN_1); +} + + +static void brw_clip_project_vertex( struct brw_clip_compile *c, +				     struct brw_indirect vert_addr ) +{ +#if 0 +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = get_tmp(c); + +   /* Fixup position.  
Extract from the original vertex and re-project +    * to screen space: +    */ +   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS])); +   brw_clip_project_position(c, tmp); +   brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp); + +   release_tmp(c, tmp); +#else +         #warning "disabled" +#endif +} + + + + +/* Interpolate between two vertices and put the result into a0.0. + * Increment a0.0 accordingly. + */ +void brw_clip_interp_vertex( struct brw_clip_compile *c, +			     struct brw_indirect dest_ptr, +			     struct brw_indirect v0_ptr, /* from */ +			     struct brw_indirect v1_ptr, /* to */ +			     struct brw_reg t0, +			     boolean force_edgeflag) +{ +#if 0 +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = get_tmp(c); +   unsigned i; + +   /* Just copy the vertex header: +    */ +   brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); + +   /* Iterate over each attribute (could be done in pairs?) +    */ +   for (i = 0; i < c->nr_attrs; i++) { +      unsigned delta = i*16 + 32; + +      if (delta == c->offset[VERT_RESULT_EDGE]) { +	 if (force_edgeflag) +	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); +	 else +	    brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta)); +      } +      else { +	 /* Interpolate: +	  * +	  *        New = attr0 + t*attr1 - t*attr0 +	  */ +	 brw_MUL(p, +		 vec4(brw_null_reg()), +		 deref_4f(v1_ptr, delta), +		 t0); + +	 brw_MAC(p, +		 tmp, +		 negate(deref_4f(v0_ptr, delta)), +		 t0); + +	 brw_ADD(p, +		 deref_4f(dest_ptr, delta), +		 deref_4f(v0_ptr, delta), +		 tmp); +      } +   } + +   if (i & 1) { +      unsigned delta = i*16 + 32; +      brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); +   } + +   release_tmp(c, tmp); + +   /* Recreate the projected (NDC) coordinate in the new vertex +    * header: +    */ +   brw_clip_project_vertex(c, dest_ptr ); +#else +         #warning "disabled" +#endif +} + + + + +#define MAX_MRF 16 + +void brw_clip_emit_vue(struct brw_clip_compile *c, +		       struct brw_indirect vert, +		       boolean allocate, +		       boolean eot, +		       unsigned header) +{ +   struct brw_compile *p = &c->func; +   unsigned start = c->last_mrf; + +   assert(!(allocate && eot)); + +   /* Cycle through mrf regs - probably futile as we have to wait for +    * the allocation response anyway.  Also, the order this function +    * is invoked doesn't correspond to the order the instructions will +    * be executed, so it won't have any effect in many cases. +    */ +#if 0 +   if (start + c->nr_regs + 1 >= MAX_MRF) +      start = 0; + +   c->last_mrf = start + c->nr_regs + 1; +#endif + +   /* Copy the vertex from vertn into m1..mN+1: +    */ +   brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs); + +   /* Overwrite PrimType and PrimStart in the message header, for +    * each vertex in turn: +    */ +   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + +   /* Send each vertex as a seperate write to the urb.  This +    * is different to the concept in brw_sf_emit.c, where +    * subsequent writes are used to build up a single urb +    * entry.  Each of these writes instantiates a seperate +    * urb entry - (I think... what about 'allocate'?) +    */ +   brw_urb_WRITE(p, +		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), +		 start, +		 c->reg.R0, +		 allocate, +		 1,		/* used */ +		 c->nr_regs + 1, /* msg length */ +		 allocate ? 
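+		 /* with 'allocate' set the write apparently returns the handle
+		  * of the newly allocated urb entry (into c->reg.R0 above),
+		  * hence one response register:
+		  */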
1 : 0, /* response_length */ +		 eot,		/* eot */ +		 1,		/* writes_complete */ +		 0,		/* urb offset */ +		 BRW_URB_SWIZZLE_NONE); +} + + + +void brw_clip_kill_thread(struct brw_clip_compile *c) +{ +   struct brw_compile *p = &c->func; + +   /* Send an empty message to kill the thread and release any +    * allocated urb entry: +    */ +   brw_urb_WRITE(p, +		 retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), +		 0, +		 c->reg.R0, +		 0,		/* allocate */ +		 0,		/* used */ +		 0, 		/* msg len */ +		 0, 		/* response len */ +		 1, 		/* eot */ +		 1,		/* writes complete */ +		 0, +		 BRW_URB_SWIZZLE_NONE); +} + + + + +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ) +{ +   return brw_address(c->reg.fixed_planes); +} + + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ) +{ +   if (c->key.nr_userclip) { +      return brw_imm_uw(16); +   } +   else { +      return brw_imm_uw(4); +   } +} + + +/* If flatshading, distribute color from provoking vertex prior to + * clipping. + */ +void brw_clip_copy_colors( struct brw_clip_compile *c, +			   unsigned to, unsigned from ) +{ +#if 0 +   struct brw_compile *p = &c->func; + +   if (c->offset[VERT_RESULT_COL0]) +      brw_MOV(p, +	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]), +	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0])); + +   if (c->offset[VERT_RESULT_COL1]) +      brw_MOV(p, +	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]), +	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1])); + +   if (c->offset[VERT_RESULT_BFC0]) +      brw_MOV(p, +	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]), +	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0])); + +   if (c->offset[VERT_RESULT_BFC1]) +      brw_MOV(p, +	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]), +	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1])); +#else +         #warning "disabled" +#endif +} + + + +void brw_clip_init_clipmask( struct brw_clip_compile *c ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg incoming = get_element_ud(c->reg.R0, 2); + +   /* Shift so that lowest outcode bit is rightmost: +    */ +   brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26)); + +   if (c->key.nr_userclip) { +      struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD); + +      /* Rearrange userclip outcodes so that they come directly after +       * the fixed plane bits. +       */ +      brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14)); +      brw_SHR(p, tmp, tmp, brw_imm_ud(8)); +      brw_OR(p, c->reg.planemask, c->reg.planemask, tmp); + +      release_tmp(c, tmp); +   } +} + diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c new file mode 100644 index 0000000000..c74cbf8d73 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_context.c @@ -0,0 +1,114 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_draw.h" +#include "brw_vs.h" +#include "brw_tex_layout.h" +#include "brw_winsys.h" + +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_context.h" +#include "util/u_memory.h" +#include "pipe/p_screen.h" + + +#ifndef BRW_DEBUG +int BRW_DEBUG = (0); +#endif + + +static void brw_destroy(struct pipe_context *pipe) +{ +   struct brw_context *brw = brw_context(pipe); + +   if(brw->winsys->destroy) +      brw->winsys->destroy(brw->winsys); +    +   FREE(brw); +} + + +static void brw_clear(struct pipe_context *pipe, struct pipe_surface *ps, +                      unsigned clearValue) +{ +   int x, y, w, h; +   /* FIXME: corny... */ + +   x = 0; +   y = 0; +   w = ps->width; +   h = ps->height; + +   pipe->surface_fill(pipe, ps, x, y, w, h, clearValue); +} + + +struct pipe_context *brw_create(struct pipe_screen *screen, +                                struct brw_winsys *brw_winsys, +                                unsigned pci_id) +{ +   struct brw_context *brw; + +   debug_printf("%s: creating brw_context with pci id 0x%x\n", +                __FUNCTION__, pci_id); + +   brw = CALLOC_STRUCT(brw_context); +   if (brw == NULL) +      return NULL; + +   brw->winsys = brw_winsys; +   brw->pipe.winsys = screen->winsys; +   brw->pipe.screen = screen; + +   brw->pipe.destroy = brw_destroy; +   brw->pipe.clear = brw_clear; + +   brw_init_surface_functions(brw); +   brw_init_texture_functions(brw); +   brw_init_state_functions(brw); +   brw_init_flush_functions(brw); +   brw_init_draw_functions( brw ); + + +   brw_init_state( brw ); + +   brw->pci_id = pci_id; +   brw->dirty = ~0; +   brw->hardware_dirty = ~0; + +   memset(&brw->wm.bind, ~0, sizeof(brw->wm.bind)); + +   return &brw->pipe; +} + diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h new file mode 100644 index 0000000000..3079485180 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_context.h @@ -0,0 +1,684 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRWCONTEXT_INC +#define BRWCONTEXT_INC + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "tgsi/tgsi_scan.h" + +#include "brw_structs.h" +#include "brw_winsys.h" + + +/* Glossary: + * + * URB - uniform resource buffer.  A mid-sized buffer which is + * partitioned between the fixed function units and used for passing + * values (vertices, primitives, constants) between them. + * + * CURBE - constant URB entry.  An urb region (entry) used to hold + * constant values which the fixed function units can be instructed to + * preload into the GRF when spawining a thread. + * + * VUE - vertex URB entry.  An urb entry holding a vertex and usually + * a vertex header.  The header contains control information and + * things like primitive type, Begin/end flags and clip codes. + * + * PUE - primitive URB entry.  An urb entry produced by the setup (SF) + * unit holding rasterization and interpolation parameters. + * + * GRF - general register file.  One of several register files + * addressable by programmed threads.  The inputs (r0, payload, curbe, + * urb) of the thread are preloaded to this area before the thread is + * spawned.  The registers are individually 8 dwords wide and suitable + * for general usage.  Registers holding thread input values are not + * special and may be overwritten. + * + * MRF - message register file.  Threads communicate (and terminate) + * by sending messages.  Message parameters are placed in contigous + * MRF registers.  All program output is via these messages.  URB + * entries are populated by sending a message to the shared URB + * function containing the new data, together with a control word, + * often an unmodified copy of R0. + * + * R0 - GRF register 0.  Typically holds control information used when + * sending messages to other threads. + * + * EU or GEN4 EU: The name of the programmable subsystem of the + * i965 hardware.  Threads are executed by the EU, the registers + * described above are part of the EU architecture. + * + * Fixed function units: + * + * CS - Command streamer.  Notional first unit, little software + * interaction.  Holds the URB entries used for constant data, ie the + * CURBEs. + * + * VF/VS - Vertex Fetch / Vertex Shader.  
The fixed function part of + * this unit is responsible for pulling vertices out of vertex buffers + * in vram and injecting them into the processing pipe as VUEs.  If + * enabled, it first passes them to a VS thread which is a good place + * for the driver to implement any active vertex shader. + * + * GS - Geometry Shader.  This corresponds to a new DX10 concept.  If + * enabled, incoming strips etc are passed to GS threads in individual + * line/triangle/point units.  The GS thread may perform arbitary + * computation and emit whatever primtives with whatever vertices it + * chooses.  This makes GS an excellent place to implement GL's + * unfilled polygon modes, though of course it is capable of much + * more.  Additionally, GS is used to translate away primitives not + * handled by latter units, including Quads and Lineloops. + * + * CS - Clipper.  Mesa's clipping algorithms are imported to run on + * this unit.  The fixed function part performs cliptesting against + * the 6 fixed clipplanes and makes descisions on whether or not the + * incoming primitive needs to be passed to a thread for clipping. + * User clip planes are handled via cooperation with the VS thread. + * + * SF - Strips Fans or Setup: Triangles are prepared for + * rasterization.  Interpolation coefficients are calculated. + * Flatshading and two-side lighting usually performed here. + * + * WM - Windower.  Interpolation of vertex attributes performed here. + * Fragment shader implemented here.  SIMD aspects of EU taken full + * advantage of, as pixels are processed in blocks of 16. + * + * CC - Color Calculator.  No EU threads associated with this unit. + * Handles blending and (presumably) depth and stencil testing. + */ + +#define BRW_MAX_CURBE                    (32*16) + +struct brw_context; +struct brw_winsys; + + +/* Raised when we receive new state across the pipe interface: + */ +#define BRW_NEW_VIEWPORT                0x1 +#define BRW_NEW_RASTERIZER              0x2 +#define BRW_NEW_FS                      0x4 +#define BRW_NEW_BLEND                   0x8 +#define BRW_NEW_CLIP                    0x10 +#define BRW_NEW_SCISSOR                 0x20 +#define BRW_NEW_STIPPLE                 0x40 +#define BRW_NEW_FRAMEBUFFER             0x80 +#define BRW_NEW_ALPHA_TEST              0x100 +#define BRW_NEW_DEPTH_STENCIL           0x200 +#define BRW_NEW_SAMPLER                 0x400 +#define BRW_NEW_TEXTURE                 0x800 +#define BRW_NEW_CONSTANTS               0x1000 +#define BRW_NEW_VBO                     0x2000 +#define BRW_NEW_VS                      0x4000 + +/* Raised for other internal events: + */ +#define BRW_NEW_URB_FENCE               0x10000 +#define BRW_NEW_PSP                     0x20000 +#define BRW_NEW_CURBE_OFFSETS           0x40000 +#define BRW_NEW_REDUCED_PRIMITIVE       0x80000 +#define BRW_NEW_PRIMITIVE               0x100000 +#define BRW_NEW_SCENE                 0x200000 +#define BRW_NEW_SF_LINKAGE              0x400000 + +extern int BRW_DEBUG; + +#define DEBUG_TEXTURE	0x1 +#define DEBUG_STATE	0x2 +#define DEBUG_IOCTL	0x4 +#define DEBUG_PRIMS	0x8 +#define DEBUG_VERTS	0x10 +#define DEBUG_FALLBACKS	0x20 +#define DEBUG_VERBOSE	0x40 +#define DEBUG_DRI       0x80 +#define DEBUG_DMA       0x100 +#define DEBUG_SANITY    0x200 +#define DEBUG_SYNC      0x400 +#define DEBUG_SLEEP     0x800 +#define DEBUG_PIXEL     0x1000 +#define DEBUG_STATS     0x2000 +#define DEBUG_TILE      0x4000 +#define DEBUG_SINGLE_THREAD   0x8000 +#define DEBUG_WM        0x10000 +#define DEBUG_URB       0x20000 +#define 
DEBUG_VS        0x40000 +#define DEBUG_BATCH	0x80000 +#define DEBUG_BUFMGR	0x100000 +#define DEBUG_BLIT	0x200000 +#define DEBUG_REGION	0x400000 +#define DEBUG_MIPTREE	0x800000 + +#define DBG(...) do {						\ +   if (BRW_DEBUG & FILE_DEBUG_FLAG)				\ +      debug_printf(__VA_ARGS__);				\ +} while(0) + +#define PRINT(...) do {						\ +   debug_printf(__VA_ARGS__);			                \ +} while(0) + +struct brw_state_flags { +   unsigned cache; +   unsigned brw; +}; + + +struct brw_vertex_program { +   struct pipe_shader_state program; +   struct tgsi_shader_info info; +   int id; +}; + + +struct brw_fragment_program { +   struct pipe_shader_state program; +   struct tgsi_shader_info info; +    +   boolean UsesDepth; /* XXX add this to tgsi_shader_info? */ +   int id; +}; + + +struct pipe_setup_linkage { +   struct { +      unsigned vp_output:5; +      unsigned interp_mode:4; +      unsigned bf_vp_output:5; +   } fp_input[PIPE_MAX_SHADER_INPUTS]; + +   unsigned fp_input_count:5; +   unsigned max_vp_output:5; +}; +    + + +struct brw_texture { +   struct pipe_texture base; + +   /* Derived from the above: +    */ +   unsigned stride; +   unsigned depth_pitch;          /* per-image on i945? */ +   unsigned total_nblocksy; + +   unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS]; + +   /* Explicitly store the offset of each image for each cube face or +    * depth value.  Pretty much have to accept that hardware formats +    * are going to be so diverse that there is no unified way to +    * compute the offsets of depth/cube images within a mipmap level, +    * so have to store them as a lookup table: +    */ +   unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS];   /**< array [depth] of offsets */ + +   /* Includes image offset tables: +    */ +   unsigned level_offset[PIPE_MAX_TEXTURE_LEVELS]; + +   /* The data is held here: +    */ +   struct pipe_buffer *buffer; +}; + +/* Data about a particular attempt to compile a program.  Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs: + */ +/* Data about a particular attempt to compile a program.  Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs: + */ + +struct brw_wm_prog_data { +   unsigned curb_read_length; +   unsigned urb_read_length; + +   unsigned first_curbe_grf; +   unsigned total_grf; +   unsigned total_scratch; + +   /* Internally generated constants for the CURBE.  These are loaded +    * ahead of the data from the constant buffer. +    */ +   const float internal_const[8]; +   unsigned nr_internal_consts; +   unsigned max_const; + +   boolean error; +}; + +struct brw_sf_prog_data { +   unsigned urb_read_length; +   unsigned total_grf; + +   /* Each vertex may have upto 12 attributes, 4 components each, +    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11 +    * rows. +    * +    * Actually we use 4 for each, so call it 12 rows. +    */ +   unsigned urb_entry_size; +}; + +struct brw_clip_prog_data { +   unsigned curb_read_length;	/* user planes? 
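+					      (presumably the user clip planes, delivered to the clip thread via the CURBE - see brw_curbe.c)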
*/ +   unsigned clip_mode; +   unsigned urb_read_length; +   unsigned total_grf; +}; + +struct brw_gs_prog_data { +   unsigned urb_read_length; +   unsigned total_grf; +}; + +struct brw_vs_prog_data { +   unsigned curb_read_length; +   unsigned urb_read_length; +   unsigned total_grf; +   unsigned outputs_written; + +   unsigned inputs_read; + +   unsigned max_const; + +   float    imm_buf[PIPE_MAX_CONSTANT][4]; +   unsigned num_imm; +   unsigned num_consts; + +   /* Used for calculating urb partitions: +    */ +   unsigned urb_entry_size; +}; + + +#define BRW_MAX_TEX_UNIT 8 +#define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + 1 + +/* Create a fixed sized struct for caching binding tables: + */ +struct brw_surface_binding_table { +   unsigned surf_ss_offset[BRW_WM_MAX_SURF]; +}; + + +struct brw_cache; + +struct brw_mem_pool { +   struct pipe_buffer *buffer; + +   unsigned size; +   unsigned offset;		/* offset of first free byte */ + +   struct brw_context *brw; +}; + +struct brw_cache_item { +   unsigned hash; +   unsigned key_size;		/* for variable-sized keys */ +   const void *key; + +   unsigned offset;		/* offset within pool's buffer */ +   unsigned data_size; + +   struct brw_cache_item *next; +}; + + + +struct brw_cache { +   unsigned id; + +   const char *name; + +   struct brw_context *brw; +   struct brw_mem_pool *pool; + +   struct brw_cache_item **items; +   unsigned size, n_items; + +   unsigned key_size;		/* for fixed-size keys */ +   unsigned aux_size; + +   unsigned last_addr;			/* offset of active item */ +}; + + + + +/* Considered adding a member to this struct to document which flags + * an update might raise so that ordering of the state atoms can be + * checked or derived at runtime.  Dropped the idea in favor of having + * a debug mode where the state is monitored for flags which are + * raised that have already been tested against. + */ +struct brw_tracked_state { +   struct brw_state_flags dirty; +   void (*update)( struct brw_context *brw ); +}; + + +/* Flags for brw->state.cache. + */ +#define CACHE_NEW_CC_VP                  (1<<BRW_CC_VP) +#define CACHE_NEW_CC_UNIT                (1<<BRW_CC_UNIT) +#define CACHE_NEW_WM_PROG                (1<<BRW_WM_PROG) +#define CACHE_NEW_SAMPLER_DEFAULT_COLOR  (1<<BRW_SAMPLER_DEFAULT_COLOR) +#define CACHE_NEW_SAMPLER                (1<<BRW_SAMPLER) +#define CACHE_NEW_WM_UNIT                (1<<BRW_WM_UNIT) +#define CACHE_NEW_SF_PROG                (1<<BRW_SF_PROG) +#define CACHE_NEW_SF_VP                  (1<<BRW_SF_VP) +#define CACHE_NEW_SF_UNIT                (1<<BRW_SF_UNIT) +#define CACHE_NEW_VS_UNIT                (1<<BRW_VS_UNIT) +#define CACHE_NEW_VS_PROG                (1<<BRW_VS_PROG) +#define CACHE_NEW_GS_UNIT                (1<<BRW_GS_UNIT) +#define CACHE_NEW_GS_PROG                (1<<BRW_GS_PROG) +#define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP) +#define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT) +#define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG) +#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE) +#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND) + + + + +enum brw_mempool_id { +   BRW_GS_POOL, +   BRW_SS_POOL, +   BRW_MAX_POOL +}; + + +struct brw_cached_batch_item { +   struct header *header; +   unsigned sz; +   struct brw_cached_batch_item *next; +}; + + + +/* Protect against a future where PIPE_MAX_ATTRIBS > 32.  Wouldn't life + * be easier if C allowed arrays of packed elements? 
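+ *
+ * (With PIPE_MAX_ATTRIBS currently 32, ATTRIB_BIT_DWORDS below works out
+ * to 1: one dword of 1-bit 'varying' flags and two dwords of 2-bit size
+ * fields in struct brw_vertex_info.)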
+ */ +#define ATTRIB_BIT_DWORDS  ((PIPE_MAX_ATTRIBS+31)/32) + + + + +struct brw_vertex_info { +   unsigned varying;  /* varying:1[PIPE_MAX_ATTRIBS] */ +   unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_MAX_ATTRIBS] */ +}; + + + + + +struct brw_context +{ +   struct pipe_context pipe; +   struct brw_winsys *winsys; + +   unsigned primitive; +   unsigned reduced_primitive; + +   boolean emit_state_always; + +   struct { +      struct brw_state_flags dirty; +   } state; + + +   struct { +      const struct pipe_blend_state         *Blend; +      const struct pipe_depth_stencil_alpha_state *DepthStencil; +      const struct pipe_poly_stipple        *PolygonStipple; +      const struct pipe_rasterizer_state    *Raster; +      const struct pipe_sampler_state       *Samplers[PIPE_MAX_SAMPLERS]; +      const struct brw_vertex_program       *VertexProgram; +      const struct brw_fragment_program     *FragmentProgram; + +      struct pipe_clip_state          Clip; +      struct pipe_blend_color         BlendColor; +      struct pipe_scissor_state       Scissor; +      struct pipe_viewport_state      Viewport; +      struct pipe_framebuffer_state   FrameBuffer; + +      const struct pipe_constant_buffer *Constants[2]; +      const struct brw_texture          *Texture[PIPE_MAX_SAMPLERS]; +   } attribs; + +   unsigned num_samplers; +   unsigned num_textures; + +   struct brw_mem_pool pool[BRW_MAX_POOL]; +   struct brw_cache cache[BRW_MAX_CACHE]; +   struct brw_cached_batch_item *cached_batch_items; + +   struct { + +      /* Arrays with buffer objects to copy non-bufferobj arrays into +       * for upload: +       */ +      const struct pipe_vertex_buffer *vbo_array[PIPE_MAX_ATTRIBS]; + +      struct brw_vertex_element_state inputs[PIPE_MAX_ATTRIBS]; + +#define BRW_NR_UPLOAD_BUFS 17 +#define BRW_UPLOAD_INIT_SIZE (128*1024) + +      /* Summary of size and varying of active arrays, so we can check +       * for changes to this state: +       */ +      struct brw_vertex_info info; +   } vb; + + +   unsigned hardware_dirty; +   unsigned dirty; +   unsigned pci_id; +   /* BRW_NEW_URB_ALLOCATIONS: +    */ +   struct { +      unsigned vsize;		/* vertex size plus header in urb registers */ +      unsigned csize;		/* constant buffer size in urb registers */ +      unsigned sfsize;		/* setup data size in urb registers */ + +      boolean constrained; + +      unsigned nr_vs_entries; +      unsigned nr_gs_entries; +      unsigned nr_clip_entries; +      unsigned nr_sf_entries; +      unsigned nr_cs_entries; + +/*       unsigned vs_size; */ +/*       unsigned gs_size; */ +/*       unsigned clip_size; */ +/*       unsigned sf_size; */ +/*       unsigned cs_size; */ + +      unsigned vs_start; +      unsigned gs_start; +      unsigned clip_start; +      unsigned sf_start; +      unsigned cs_start; +   } urb; + + +   /* BRW_NEW_CURBE_OFFSETS: +    */ +   struct { +      unsigned wm_start; +      unsigned wm_size; +      unsigned clip_start; +      unsigned clip_size; +      unsigned vs_start; +      unsigned vs_size; +      unsigned total_size; + +      unsigned gs_offset; + +      float *last_buf; +      unsigned last_bufsz; +   } curbe; + +   struct { +      struct brw_vs_prog_data *prog_data; + +      unsigned prog_gs_offset; +      unsigned state_gs_offset; +   } vs; + +   struct { +      struct brw_gs_prog_data *prog_data; + +      boolean prog_active; +      unsigned prog_gs_offset; +      unsigned state_gs_offset; +   } gs; + +   struct { +      struct brw_clip_prog_data *prog_data; + +      unsigned 
prog_gs_offset; +      unsigned vp_gs_offset; +      unsigned state_gs_offset; +   } clip; + + +   struct { +      struct brw_sf_prog_data *prog_data; + +      struct pipe_setup_linkage linkage; + +      unsigned prog_gs_offset; +      unsigned vp_gs_offset; +      unsigned state_gs_offset; +   } sf; + +   struct { +      struct brw_wm_prog_data *prog_data; + +//      struct brw_wm_compiler *compile_data; + + +      /** +       * Array of sampler state uploaded at sampler_gs_offset of BRW_SAMPLER +       * cache +       */ +      struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT]; + +      unsigned render_surf; +      unsigned nr_surfaces; + +      unsigned max_threads; +      struct pipe_buffer *scratch_buffer; +      unsigned scratch_buffer_size; + +      unsigned sampler_count; +      unsigned sampler_gs_offset; + +      struct brw_surface_binding_table bind; +      unsigned bind_ss_offset; + +      unsigned prog_gs_offset; +      unsigned state_gs_offset; +   } wm; + + +   struct { +      unsigned vp_gs_offset; +      unsigned state_gs_offset; +   } cc; + + +   /* Used to give every program string a unique id +    */ +   unsigned program_id; +}; + + +#define BRW_PACKCOLOR8888(r,g,b,a)  ((r<<24) | (g<<16) | (b<<8) | a) + + +/*====================================================================== + * brw_vtbl.c + */ +void brw_do_flush( struct brw_context *brw, +		   unsigned flags ); + + +/*====================================================================== + * brw_state.c + */ +void brw_validate_state(struct brw_context *brw); +void brw_init_state(struct brw_context *brw); +void brw_destroy_state(struct brw_context *brw); + + +/*====================================================================== + * brw_tex.c + */ +void brwUpdateTextureState( struct brw_context *brw ); + + +/* brw_urb.c + */ +void brw_upload_urb_fence(struct brw_context *brw); + +void brw_upload_constant_buffer_state(struct brw_context *brw); + +void brw_init_surface_functions(struct brw_context *brw); +void brw_init_state_functions(struct brw_context *brw); +void brw_init_flush_functions(struct brw_context *brw); +void brw_init_string_functions(struct brw_context *brw); + +/*====================================================================== + * Inline conversion functions.  These are better-typed than the + * macros used previously: + */ +static inline struct brw_context * +brw_context( struct pipe_context *ctx ) +{ +   return (struct brw_context *)ctx; +} + +#endif + diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c new file mode 100644 index 0000000000..904cde8e30 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_curbe.c @@ -0,0 +1,369 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_batch.h" +#include "brw_util.h" +#include "brw_wm.h" +#include "pipe/p_state.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#define FILE_DEBUG_FLAG DEBUG_FALLBACKS + +/* Partition the CURBE between the various users of constant values: + */ +static void calculate_curbe_offsets( struct brw_context *brw ) +{ +   /* CACHE_NEW_WM_PROG */ +   unsigned nr_fp_regs = align(brw->wm.prog_data->max_const, 16); + +   /* BRW_NEW_VERTEX_PROGRAM */ +   unsigned nr_vp_regs = align(brw->vs.prog_data->max_const, 16); +   unsigned nr_clip_regs = 0; +   unsigned total_regs; + +#if 0 +   /* BRW_NEW_CLIP ? */ +   if (brw->attribs.Transform->ClipPlanesEnabled) { +      unsigned nr_planes = 6 + brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled); +      nr_clip_regs = align(nr_planes * 4, 16); +   } +#endif + + +   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; + +   /* This can happen - what to do?  Probably rather than falling +    * back, the best thing to do is emit programs which code the +    * constants as immediate values.  Could do this either as a static +    * cap on WM and VS, or adaptively. +    * +    * Unfortunately, this is currently dependent on the results of the +    * program generation process (in the case of wm), so this would +    * introduce the need to re-generate programs in the event of a +    * curbe allocation failure. +    */ +   /* Max size is 32 - just large enough to +    * hold the 128 parameters allowed by +    * the fragment and vertex program +    * api's.  It's not clear what happens +    * when both VP and FP want to use 128 +    * parameters, though. 
+    */ +   assert(total_regs <= 32); + +   /* Lazy resize: +    */ +   if (nr_fp_regs > brw->curbe.wm_size || +       nr_vp_regs > brw->curbe.vs_size || +       nr_clip_regs != brw->curbe.clip_size || +       (total_regs < brw->curbe.total_size / 4 && +	brw->curbe.total_size > 16)) { + +      unsigned reg = 0; + +      /* Calculate a new layout: +       */ +      reg = 0; +      brw->curbe.wm_start = reg; +      brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; +      brw->curbe.clip_start = reg; +      brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; +      brw->curbe.vs_start = reg; +      brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; +      brw->curbe.total_size = reg; + +#if 0 +      if (0) +	 DBG("curbe wm %d+%d clip %d+%d vs %d+%d\n", +		      brw->curbe.wm_start, +		      brw->curbe.wm_size, +		      brw->curbe.clip_start, +		      brw->curbe.clip_size, +		      brw->curbe.vs_start, +		      brw->curbe.vs_size ); +#endif + +      brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; +   } +} + + +const struct brw_tracked_state brw_curbe_offsets = { +   .dirty = { +      .brw  = (BRW_NEW_CLIP | +	       BRW_NEW_VS), +      .cache = CACHE_NEW_WM_PROG +   }, +   .update = calculate_curbe_offsets +}; + + + +/* Define the number of curbes within CS's urb allocation.  Multiple + * urb entries -> multiple curbes.  These will be used by + * fixed-function hardware in a double-buffering scheme to avoid a + * pipeline stall each time the contents of the curbe is changed. + */ +void brw_upload_constant_buffer_state(struct brw_context *brw) +{ +   struct brw_constant_buffer_state cbs; +   memset(&cbs, 0, sizeof(cbs)); + +   /* It appears that this is the state packet for the CS unit, ie. the +    * urb entries detailed here are housed in the CS range from the +    * URB_FENCE command. +    */ +   cbs.header.opcode = CMD_CONST_BUFFER_STATE; +   cbs.header.length = sizeof(cbs)/4 - 2; + +   /* BRW_NEW_URB_FENCE */ +   cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries; +   cbs.bits0.urb_entry_size = brw->urb.csize - 1; + +   assert(brw->urb.nr_cs_entries); +   BRW_CACHED_BATCH_STRUCT(brw, &cbs); +} + + +static float fixed_plane[6][4] = { +   { 0,    0,   -1, 1 }, +   { 0,    0,    1, 1 }, +   { 0,   -1,    0, 1 }, +   { 0,    1,    0, 1 }, +   {-1,    0,    0, 1 }, +   { 1,    0,    0, 1 } +}; + +/* Upload a new set of constants.  Too much variability to go into the + * cache mechanism, but maybe would benefit from a comparison against + * the current uploaded set of constants. 
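+ *
+ * (The code below does in fact memcmp() the freshly built block against
+ * curbe.last_buf and skips re-copying the data into the pool when it is
+ * unchanged, though the CONST_BUFFER packet itself is still emitted.)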
+ */ +static void upload_constant_buffer(struct brw_context *brw) +{ +   struct brw_mem_pool *pool = &brw->pool[BRW_GS_POOL]; +   unsigned sz = brw->curbe.total_size; +   unsigned bufsz = sz * sizeof(float); +   float *buf; +   unsigned i; + + +   if (sz == 0) { +      struct brw_constant_buffer cb; +      cb.header.opcode = CMD_CONST_BUFFER; +      cb.header.length = sizeof(cb)/4 - 2; +      cb.header.valid = 0; +      cb.bits0.buffer_length = 0; +      cb.bits0.buffer_address = 0; +      BRW_BATCH_STRUCT(brw, &cb); + +      if (brw->curbe.last_buf) { +	 free(brw->curbe.last_buf); +	 brw->curbe.last_buf = NULL; +	 brw->curbe.last_bufsz  = 0; +      } + +      return; +   } + +   buf = (float *)malloc(bufsz); + +   memset(buf, 0, bufsz); + +   if (brw->curbe.wm_size) { +      unsigned offset = brw->curbe.wm_start * 16; + +      /* First the constant buffer constants: +       */ +       +      /* Then any internally generated constants:  +       */ +      for (i = 0; i < brw->wm.prog_data->nr_internal_consts; i++) +	 buf[offset + i] = brw->wm.prog_data->internal_const[i]; + +      assert(brw->wm.prog_data->max_const ==  +	     brw->wm.prog_data->nr_internal_consts); +   } + + +   /* The clipplanes are actually delivered to both CLIP and VS units. +    * VS uses them to calculate the outcode bitmasks. +    */ +   if (brw->curbe.clip_size) { +      unsigned offset = brw->curbe.clip_start * 16; +      unsigned j; + +      /* If any planes are going this way, send them all this way: +       */ +      for (i = 0; i < 6; i++) { +	 buf[offset + i * 4 + 0] = fixed_plane[i][0]; +	 buf[offset + i * 4 + 1] = fixed_plane[i][1]; +	 buf[offset + i * 4 + 2] = fixed_plane[i][2]; +	 buf[offset + i * 4 + 3] = fixed_plane[i][3]; +      } + +      /* Clip planes: BRW_NEW_CLIP: +       */ +      for (j = 0; j < brw->attribs.Clip.nr; j++) { +	 buf[offset + i * 4 + 0] = brw->attribs.Clip.ucp[j][0]; +	 buf[offset + i * 4 + 1] = brw->attribs.Clip.ucp[j][1]; +	 buf[offset + i * 4 + 2] = brw->attribs.Clip.ucp[j][2]; +	 buf[offset + i * 4 + 3] = brw->attribs.Clip.ucp[j][3]; +	 i++; +      } +   } + + +   if (brw->curbe.vs_size) { +      unsigned offset = brw->curbe.vs_start * 16; +      /*unsigned nr = vp->max_const;*/ +      const struct pipe_constant_buffer *cbuffer = brw->attribs.Constants[0]; +      struct pipe_winsys *ws = brw->pipe.winsys; +      /* FIXME: buffer size is num_consts + num_immediates */ +      if (brw->vs.prog_data->num_consts) { +         /* map the vertex constant buffer and copy to curbe: */ +         void *data = ws->buffer_map(ws, cbuffer->buffer, 0); +         /* FIXME: this is wrong. the cbuffer->buffer->size currently +          * represents size of consts + immediates. so if we'll +          * have both we'll copy over the end of the buffer +          * with the subsequent memcpy */ +         memcpy(&buf[offset], data, cbuffer->buffer->size); +         ws->buffer_unmap(ws, cbuffer->buffer); +         offset += cbuffer->buffer->size; +      } +      /*immediates*/ +      if (brw->vs.prog_data->num_imm) { +         memcpy(&buf[offset], brw->vs.prog_data->imm_buf, +                brw->vs.prog_data->num_imm * 4 * sizeof(float)); +      } +   } + +   if (1) { +      for (i = 0; i < sz; i+=4) +	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, +		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]); + +      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n", +		   brw->curbe.last_buf, buf, +		   bufsz, brw->curbe.last_bufsz, +		   brw->curbe.last_buf ? 
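+		   /* -1 in the debug output means no previous buffer to compare */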
memcmp(buf, brw->curbe.last_buf, bufsz) : -1); +   } + +   if (brw->curbe.last_buf && +       bufsz == brw->curbe.last_bufsz && +       memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { +      free(buf); +/*       return; */ +   } +   else { +      if (brw->curbe.last_buf) +	 free(brw->curbe.last_buf); +      brw->curbe.last_buf = buf; +      brw->curbe.last_bufsz = bufsz; + + +      if (!brw_pool_alloc(pool, +			  bufsz, +			  1 << 6, +			  &brw->curbe.gs_offset)) { +	 debug_printf("out of GS memory for curbe\n"); +	 assert(0); +	 return; +      } + + +      /* Copy data to the buffer: +       */ +      brw->winsys->buffer_subdata_typed(brw->winsys, +					pool->buffer,  +					brw->curbe.gs_offset,  +					bufsz,  +					buf, +					BRW_CONSTANT_BUFFER ); +   } + +   /* TODO: only emit the constant_buffer packet when necessary, ie: +      - contents have changed +      - offset has changed +      - hw requirements due to other packets emitted. +   */ +   { +      struct brw_constant_buffer cb; + +      memset(&cb, 0, sizeof(cb)); + +      cb.header.opcode = CMD_CONST_BUFFER; +      cb.header.length = sizeof(cb)/4 - 2; +      cb.header.valid = 1; +      cb.bits0.buffer_length = sz - 1; +      cb.bits0.buffer_address = brw->curbe.gs_offset >> 6; + +      /* Because this provokes an action (ie copy the constants into the +       * URB), it shouldn't be shortcircuited if identical to the +       * previous time - because eg. the urb destination may have +       * changed, or the urb contents different to last time. +       * +       * Note that the data referred to is actually copied internally, +       * not just used in place according to passed pointer. +       * +       * It appears that the CS unit takes care of using each available +       * URB entry (Const URB Entry == CURBE) in turn, and issuing +       * flushes as necessary when doublebuffering of CURBEs isn't +       * possible. +       */ +      BRW_BATCH_STRUCT(brw, &cb); +   } +} + +/* This tracked state is unique in that the state it monitors varies + * dynamically depending on the parameters tracked by the fragment and + * vertex programs.  This is the template used as a starting point, + * each context will maintain a copy of this internally and update as + * required. + */ +const struct brw_tracked_state brw_constant_buffer = { +   .dirty = { +      .brw  = (BRW_NEW_CLIP | +	       BRW_NEW_CONSTANTS | +	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */ +	       BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */ +	       BRW_NEW_CURBE_OFFSETS), +      .cache = (CACHE_NEW_WM_PROG) +   }, +   .update = upload_constant_buffer +}; + diff --git a/src/gallium/drivers/i965simple/brw_defines.h b/src/gallium/drivers/i965simple/brw_defines.h new file mode 100644 index 0000000000..9379a397f6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_defines.h @@ -0,0 +1,852 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRW_DEFINES_H +#define BRW_DEFINES_H + +/* + */ +#define MI_NOOP                              0x00 +#define MI_USER_INTERRUPT                    0x02 +#define MI_WAIT_FOR_EVENT                    0x03 +#define MI_FLUSH                             0x04 +#define MI_REPORT_HEAD                       0x07 +#define MI_ARB_ON_OFF                        0x08 +#define MI_BATCH_BUFFER_END                  0x0A +#define MI_OVERLAY_FLIP                      0x11 +#define MI_LOAD_SCAN_LINES_INCL              0x12 +#define MI_LOAD_SCAN_LINES_EXCL              0x13 +#define MI_DISPLAY_BUFFER_INFO               0x14 +#define MI_SET_CONTEXT                       0x18 +#define MI_STORE_DATA_IMM                    0x20 +#define MI_STORE_DATA_INDEX                  0x21 +#define MI_LOAD_REGISTER_IMM                 0x22 +#define MI_STORE_REGISTER_MEM                0x24 +#define MI_BATCH_BUFFER_START                0x31 + +#define MI_SYNCHRONOUS_FLIP                  0x0 +#define MI_ASYNCHRONOUS_FLIP                 0x1 + +#define MI_BUFFER_SECURE                     0x0 +#define MI_BUFFER_NONSECURE                  0x1 + +#define MI_ARBITRATE_AT_CHAIN_POINTS         0x0 +#define MI_ARBITRATE_BETWEEN_INSTS           0x1 +#define MI_NO_ARBITRATION                    0x3 + +#define MI_CONDITION_CODE_WAIT_DISABLED      0x0 +#define MI_CONDITION_CODE_WAIT_0             0x1 +#define MI_CONDITION_CODE_WAIT_1             0x2 +#define MI_CONDITION_CODE_WAIT_2             0x3 +#define MI_CONDITION_CODE_WAIT_3             0x4 +#define MI_CONDITION_CODE_WAIT_4             0x5 + +#define MI_DISPLAY_PIPE_A                    0x0 +#define MI_DISPLAY_PIPE_B                    0x1 + +#define MI_DISPLAY_PLANE_A                   0x0 +#define MI_DISPLAY_PLANE_B                   0x1 +#define MI_DISPLAY_PLANE_C                   0x2 + +#define MI_STANDARD_FLIP                                 0x0 +#define MI_ENQUEUE_FLIP_PERFORM_BASE_FRAME_NUMBER_LOAD   0x1 +#define MI_ENQUEUE_FLIP_TARGET_FRAME_NUMBER_RELATIVE     0x2 +#define MI_ENQUEUE_FLIP_ABSOLUTE_TARGET_FRAME_NUMBER     0x3 + +#define MI_PHYSICAL_ADDRESS                  0x0 +#define MI_VIRTUAL_ADDRESS                   0x1 + +#define MI_BUFFER_MEMORY_MAIN                0x0 +#define MI_BUFFER_MEMORY_GTT                 0x2 
+#define MI_BUFFER_MEMORY_PER_PROCESS_GTT     0x3 + +#define MI_FLIP_CONTINUE                     0x0 +#define MI_FLIP_ON                           0x1 +#define MI_FLIP_OFF                          0x2 + +#define MI_UNTRUSTED_REGISTER_SPACE          0x0 +#define MI_TRUSTED_REGISTER_SPACE            0x1 + +/* 3D state: + */ +#define _3DOP_3DSTATE_PIPELINED       0x0 +#define _3DOP_3DSTATE_NONPIPELINED    0x1 +#define _3DOP_3DCONTROL               0x2 +#define _3DOP_3DPRIMITIVE             0x3 + +#define _3DSTATE_PIPELINED_POINTERS       0x00 +#define _3DSTATE_BINDING_TABLE_POINTERS   0x01 +#define _3DSTATE_VERTEX_BUFFERS           0x08 +#define _3DSTATE_VERTEX_ELEMENTS          0x09 +#define _3DSTATE_INDEX_BUFFER             0x0A +#define _3DSTATE_VF_STATISTICS            0x0B +#define _3DSTATE_DRAWING_RECTANGLE            0x00 +#define _3DSTATE_CONSTANT_COLOR               0x01 +#define _3DSTATE_SAMPLER_PALETTE_LOAD         0x02 +#define _3DSTATE_CHROMA_KEY                   0x04 +#define _3DSTATE_DEPTH_BUFFER                 0x05 +#define _3DSTATE_POLY_STIPPLE_OFFSET          0x06 +#define _3DSTATE_POLY_STIPPLE_PATTERN         0x07 +#define _3DSTATE_LINE_STIPPLE                 0x08 +#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP    0x09 +#define _3DCONTROL    0x00 +#define _3DPRIMITIVE  0x00 + +#define PIPE_CONTROL_NOWRITE          0x00 +#define PIPE_CONTROL_WRITEIMMEDIATE   0x01 +#define PIPE_CONTROL_WRITEDEPTH       0x02 +#define PIPE_CONTROL_WRITETIMESTAMP   0x03 + +#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00 +#define PIPE_CONTROL_GTTWRITE_GLOBAL        0x01 + +#define _3DPRIM_POINTLIST         0x01 +#define _3DPRIM_LINELIST          0x02 +#define _3DPRIM_LINESTRIP         0x03 +#define _3DPRIM_TRILIST           0x04 +#define _3DPRIM_TRISTRIP          0x05 +#define _3DPRIM_TRIFAN            0x06 +#define _3DPRIM_QUADLIST          0x07 +#define _3DPRIM_QUADSTRIP         0x08 +#define _3DPRIM_LINELIST_ADJ      0x09 +#define _3DPRIM_LINESTRIP_ADJ     0x0A +#define _3DPRIM_TRILIST_ADJ       0x0B +#define _3DPRIM_TRISTRIP_ADJ      0x0C +#define _3DPRIM_TRISTRIP_REVERSE  0x0D +#define _3DPRIM_POLYGON           0x0E +#define _3DPRIM_RECTLIST          0x0F +#define _3DPRIM_LINELOOP          0x10 +#define _3DPRIM_POINTLIST_BF      0x11 +#define _3DPRIM_LINESTRIP_CONT    0x12 +#define _3DPRIM_LINESTRIP_BF      0x13 +#define _3DPRIM_LINESTRIP_CONT_BF 0x14 +#define _3DPRIM_TRIFAN_NOSTIPPLE  0x15 + +#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0 +#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     1 + +#define BRW_ANISORATIO_2     0 +#define BRW_ANISORATIO_4     1 +#define BRW_ANISORATIO_6     2 +#define BRW_ANISORATIO_8     3 +#define BRW_ANISORATIO_10    4 +#define BRW_ANISORATIO_12    5 +#define BRW_ANISORATIO_14    6 +#define BRW_ANISORATIO_16    7 + +#define BRW_BLENDFACTOR_ONE                 0x1 +#define BRW_BLENDFACTOR_SRC_COLOR           0x2 +#define BRW_BLENDFACTOR_SRC_ALPHA           0x3 +#define BRW_BLENDFACTOR_DST_ALPHA           0x4 +#define BRW_BLENDFACTOR_DST_COLOR           0x5 +#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE  0x6 +#define BRW_BLENDFACTOR_CONST_COLOR         0x7 +#define BRW_BLENDFACTOR_CONST_ALPHA         0x8 +#define BRW_BLENDFACTOR_SRC1_COLOR          0x9 +#define BRW_BLENDFACTOR_SRC1_ALPHA          0x0A +#define BRW_BLENDFACTOR_ZERO                0x11 +#define BRW_BLENDFACTOR_INV_SRC_COLOR       0x12 +#define BRW_BLENDFACTOR_INV_SRC_ALPHA       0x13 +#define BRW_BLENDFACTOR_INV_DST_ALPHA       0x14 +#define BRW_BLENDFACTOR_INV_DST_COLOR       0x15 +#define 
BRW_BLENDFACTOR_INV_CONST_COLOR     0x17 +#define BRW_BLENDFACTOR_INV_CONST_ALPHA     0x18 +#define BRW_BLENDFACTOR_INV_SRC1_COLOR      0x19 +#define BRW_BLENDFACTOR_INV_SRC1_ALPHA      0x1A + +#define BRW_BLENDFUNCTION_ADD               0 +#define BRW_BLENDFUNCTION_SUBTRACT          1 +#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT  2 +#define BRW_BLENDFUNCTION_MIN               3 +#define BRW_BLENDFUNCTION_MAX               4 + +#define BRW_ALPHATEST_FORMAT_UNORM8         0 +#define BRW_ALPHATEST_FORMAT_FLOAT32        1 + +#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH  0 +#define BRW_CHROMAKEY_REPLACE_BLACK      1 + +#define BRW_CLIP_API_OGL     0 +#define BRW_CLIP_API_DX      1 + +#define BRW_CLIPMODE_NORMAL              0 +#define BRW_CLIPMODE_CLIP_ALL            1 +#define BRW_CLIPMODE_CLIP_NON_REJECTED   2 +#define BRW_CLIPMODE_REJECT_ALL          3 +#define BRW_CLIPMODE_ACCEPT_ALL          4 + +#define BRW_CLIP_NDCSPACE     0 +#define BRW_CLIP_SCREENSPACE  1 + +#define BRW_COMPAREFUNCTION_ALWAYS       0 +#define BRW_COMPAREFUNCTION_NEVER        1 +#define BRW_COMPAREFUNCTION_LESS         2 +#define BRW_COMPAREFUNCTION_EQUAL        3 +#define BRW_COMPAREFUNCTION_LEQUAL       4 +#define BRW_COMPAREFUNCTION_GREATER      5 +#define BRW_COMPAREFUNCTION_NOTEQUAL     6 +#define BRW_COMPAREFUNCTION_GEQUAL       7 + +#define BRW_COVERAGE_PIXELS_HALF     0 +#define BRW_COVERAGE_PIXELS_1        1 +#define BRW_COVERAGE_PIXELS_2        2 +#define BRW_COVERAGE_PIXELS_4        3 + +#define BRW_CULLMODE_BOTH        0 +#define BRW_CULLMODE_NONE        1 +#define BRW_CULLMODE_FRONT       2 +#define BRW_CULLMODE_BACK        3 + +#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM      0 +#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT  1 + +#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT     0 +#define BRW_DEPTHFORMAT_D32_FLOAT                1 +#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT        2 +#define BRW_DEPTHFORMAT_D16_UNORM                5 + +#define BRW_FLOATING_POINT_IEEE_754        0 +#define BRW_FLOATING_POINT_NON_IEEE_754    1 + +#define BRW_FRONTWINDING_CW      0 +#define BRW_FRONTWINDING_CCW     1 + +#define BRW_SPRITE_POINT_ENABLE  16 + +#define BRW_INDEX_BYTE     0 +#define BRW_INDEX_WORD     1 +#define BRW_INDEX_DWORD    2 + +#define BRW_LOGICOPFUNCTION_CLEAR            0 +#define BRW_LOGICOPFUNCTION_NOR              1 +#define BRW_LOGICOPFUNCTION_AND_INVERTED     2 +#define BRW_LOGICOPFUNCTION_COPY_INVERTED    3 +#define BRW_LOGICOPFUNCTION_AND_REVERSE      4 +#define BRW_LOGICOPFUNCTION_INVERT           5 +#define BRW_LOGICOPFUNCTION_XOR              6 +#define BRW_LOGICOPFUNCTION_NAND             7 +#define BRW_LOGICOPFUNCTION_AND              8 +#define BRW_LOGICOPFUNCTION_EQUIV            9 +#define BRW_LOGICOPFUNCTION_NOOP             10 +#define BRW_LOGICOPFUNCTION_OR_INVERTED      11 +#define BRW_LOGICOPFUNCTION_COPY             12 +#define BRW_LOGICOPFUNCTION_OR_REVERSE       13 +#define BRW_LOGICOPFUNCTION_OR               14 +#define BRW_LOGICOPFUNCTION_SET              15 + +#define BRW_MAPFILTER_NEAREST        0x0 +#define BRW_MAPFILTER_LINEAR         0x1 +#define BRW_MAPFILTER_ANISOTROPIC    0x2 + +#define BRW_MIPFILTER_NONE        0 +#define BRW_MIPFILTER_NEAREST     1 +#define BRW_MIPFILTER_LINEAR      3 + +#define BRW_POLYGON_FRONT_FACING     0 +#define BRW_POLYGON_BACK_FACING      1 + +#define BRW_PREFILTER_ALWAYS     0x0 +#define BRW_PREFILTER_NEVER      0x1 +#define BRW_PREFILTER_LESS       0x2 +#define BRW_PREFILTER_EQUAL      0x3 +#define BRW_PREFILTER_LEQUAL     0x4 +#define 
BRW_PREFILTER_GREATER    0x5 +#define BRW_PREFILTER_NOTEQUAL   0x6 +#define BRW_PREFILTER_GEQUAL     0x7 + +#define BRW_PROVOKING_VERTEX_0    0 +#define BRW_PROVOKING_VERTEX_1    1 +#define BRW_PROVOKING_VERTEX_2    2 + +#define BRW_RASTRULE_UPPER_LEFT  0 +#define BRW_RASTRULE_UPPER_RIGHT 1 + +#define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0 +#define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1 +#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT   2 + +#define BRW_STENCILOP_KEEP               0 +#define BRW_STENCILOP_ZERO               1 +#define BRW_STENCILOP_REPLACE            2 +#define BRW_STENCILOP_INCRSAT            3 +#define BRW_STENCILOP_DECRSAT            4 +#define BRW_STENCILOP_INCR               5 +#define BRW_STENCILOP_DECR               6 +#define BRW_STENCILOP_INVERT             7 + +#define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0 +#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT   1 + +#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000 +#define BRW_SURFACEFORMAT_R32G32B32A32_SINT              0x001 +#define BRW_SURFACEFORMAT_R32G32B32A32_UINT              0x002 +#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM             0x003 +#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM             0x004 +#define BRW_SURFACEFORMAT_R64G64_FLOAT                   0x005 +#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT             0x006 +#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED           0x007 +#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED           0x008 +#define BRW_SURFACEFORMAT_R32G32B32_FLOAT                0x040 +#define BRW_SURFACEFORMAT_R32G32B32_SINT                 0x041 +#define BRW_SURFACEFORMAT_R32G32B32_UINT                 0x042 +#define BRW_SURFACEFORMAT_R32G32B32_UNORM                0x043 +#define BRW_SURFACEFORMAT_R32G32B32_SNORM                0x044 +#define BRW_SURFACEFORMAT_R32G32B32_SSCALED              0x045 +#define BRW_SURFACEFORMAT_R32G32B32_USCALED              0x046 +#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM             0x080 +#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM             0x081 +#define BRW_SURFACEFORMAT_R16G16B16A16_SINT              0x082 +#define BRW_SURFACEFORMAT_R16G16B16A16_UINT              0x083 +#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT             0x084 +#define BRW_SURFACEFORMAT_R32G32_FLOAT                   0x085 +#define BRW_SURFACEFORMAT_R32G32_SINT                    0x086 +#define BRW_SURFACEFORMAT_R32G32_UINT                    0x087 +#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088 +#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089 +#define BRW_SURFACEFORMAT_L32A32_FLOAT                   0x08A +#define BRW_SURFACEFORMAT_R32G32_UNORM                   0x08B +#define BRW_SURFACEFORMAT_R32G32_SNORM                   0x08C +#define BRW_SURFACEFORMAT_R64_FLOAT                      0x08D +#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM             0x08E +#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F +#define BRW_SURFACEFORMAT_A32X32_FLOAT                   0x090 +#define BRW_SURFACEFORMAT_L32X32_FLOAT                   0x091 +#define BRW_SURFACEFORMAT_I32X32_FLOAT                   0x092 +#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED           0x093 +#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED           0x094 +#define BRW_SURFACEFORMAT_R32G32_SSCALED                 0x095 +#define BRW_SURFACEFORMAT_R32G32_USCALED                 0x096 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM   
           0x0C2 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3 +#define BRW_SURFACEFORMAT_R10G10B10A2_UINT               0x0C4 +#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8 +#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9 +#define BRW_SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA +#define BRW_SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB +#define BRW_SURFACEFORMAT_R16G16_UNORM                   0x0CC +#define BRW_SURFACEFORMAT_R16G16_SNORM                   0x0CD +#define BRW_SURFACEFORMAT_R16G16_SINT                    0x0CE +#define BRW_SURFACEFORMAT_R16G16_UINT                    0x0CF +#define BRW_SURFACEFORMAT_R16G16_FLOAT                   0x0D0 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2 +#define BRW_SURFACEFORMAT_R11G11B10_FLOAT                0x0D3 +#define BRW_SURFACEFORMAT_R32_SINT                       0x0D6 +#define BRW_SURFACEFORMAT_R32_UINT                       0x0D7 +#define BRW_SURFACEFORMAT_R32_FLOAT                      0x0D8 +#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9 +#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA +#define BRW_SURFACEFORMAT_L16A16_UNORM                   0x0DF +#define BRW_SURFACEFORMAT_I24X8_UNORM                    0x0E0 +#define BRW_SURFACEFORMAT_L24X8_UNORM                    0x0E1 +#define BRW_SURFACEFORMAT_A24X8_UNORM                    0x0E2 +#define BRW_SURFACEFORMAT_I32_FLOAT                      0x0E3 +#define BRW_SURFACEFORMAT_L32_FLOAT                      0x0E4 +#define BRW_SURFACEFORMAT_A32_FLOAT                      0x0E5 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC +#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED +#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE +#define BRW_SURFACEFORMAT_L16A16_FLOAT                   0x0F0 +#define BRW_SURFACEFORMAT_R32_UNORM                      0x0F1 +#define BRW_SURFACEFORMAT_R32_SNORM                      0x0F2 +#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3 +#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4 +#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5 +#define BRW_SURFACEFORMAT_R16G16_SSCALED                 0x0F6 +#define BRW_SURFACEFORMAT_R16G16_USCALED                 0x0F7 +#define BRW_SURFACEFORMAT_R32_SSCALED                    0x0F8 +#define BRW_SURFACEFORMAT_R32_USCALED                    0x0F9 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM                   0x100 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM                 0x102 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM                 0x104 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105 +#define BRW_SURFACEFORMAT_R8G8_UNORM                     0x106 +#define BRW_SURFACEFORMAT_R8G8_SNORM                     0x107 +#define BRW_SURFACEFORMAT_R8G8_SINT                      0x108 +#define BRW_SURFACEFORMAT_R8G8_UINT                      0x109 +#define BRW_SURFACEFORMAT_R16_UNORM                      0x10A +#define 
BRW_SURFACEFORMAT_R16_SNORM                      0x10B +#define BRW_SURFACEFORMAT_R16_SINT                       0x10C +#define BRW_SURFACEFORMAT_R16_UINT                       0x10D +#define BRW_SURFACEFORMAT_R16_FLOAT                      0x10E +#define BRW_SURFACEFORMAT_I16_UNORM                      0x111 +#define BRW_SURFACEFORMAT_L16_UNORM                      0x112 +#define BRW_SURFACEFORMAT_A16_UNORM                      0x113 +#define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114 +#define BRW_SURFACEFORMAT_I16_FLOAT                      0x115 +#define BRW_SURFACEFORMAT_L16_FLOAT                      0x116 +#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117 +#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119 +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B +#define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C +#define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D +#define BRW_SURFACEFORMAT_R16_SSCALED                    0x11E +#define BRW_SURFACEFORMAT_R16_USCALED                    0x11F +#define BRW_SURFACEFORMAT_R8_UNORM                       0x140 +#define BRW_SURFACEFORMAT_R8_SNORM                       0x141 +#define BRW_SURFACEFORMAT_R8_SINT                        0x142 +#define BRW_SURFACEFORMAT_R8_UINT                        0x143 +#define BRW_SURFACEFORMAT_A8_UNORM                       0x144 +#define BRW_SURFACEFORMAT_I8_UNORM                       0x145 +#define BRW_SURFACEFORMAT_L8_UNORM                       0x146 +#define BRW_SURFACEFORMAT_P4A4_UNORM                     0x147 +#define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148 +#define BRW_SURFACEFORMAT_R8_SSCALED                     0x149 +#define BRW_SURFACEFORMAT_R8_USCALED                     0x14A +#define BRW_SURFACEFORMAT_R1_UINT                        0x181 +#define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182 +#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 +#define BRW_SURFACEFORMAT_BC1_UNORM                      0x186 +#define BRW_SURFACEFORMAT_BC2_UNORM                      0x187 +#define BRW_SURFACEFORMAT_BC3_UNORM                      0x188 +#define BRW_SURFACEFORMAT_BC4_UNORM                      0x189 +#define BRW_SURFACEFORMAT_BC5_UNORM                      0x18A +#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B +#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C +#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D +#define BRW_SURFACEFORMAT_MONO8                          0x18E +#define BRW_SURFACEFORMAT_YCRCB_SWAPUV                   0x18F +#define BRW_SURFACEFORMAT_YCRCB_SWAPY                    0x190 +#define BRW_SURFACEFORMAT_DXT1_RGB                       0x191 +#define BRW_SURFACEFORMAT_FXT1                           0x192 +#define BRW_SURFACEFORMAT_R8G8B8_UNORM                   0x193 +#define BRW_SURFACEFORMAT_R8G8B8_SNORM                   0x194 +#define BRW_SURFACEFORMAT_R8G8B8_SSCALED                 0x195 +#define BRW_SURFACEFORMAT_R8G8B8_USCALED                 0x196 +#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT             0x197 +#define BRW_SURFACEFORMAT_R64G64B64_FLOAT                0x198 +#define BRW_SURFACEFORMAT_BC4_SNORM                      0x199 +#define BRW_SURFACEFORMAT_BC5_SNORM                      0x19A +#define BRW_SURFACEFORMAT_R16G16B16_UNORM                0x19C +#define BRW_SURFACEFORMAT_R16G16B16_SNORM                0x19D +#define 
BRW_SURFACEFORMAT_R16G16B16_SSCALED              0x19E +#define BRW_SURFACEFORMAT_R16G16B16_USCALED              0x19F + +#define BRW_SURFACERETURNFORMAT_FLOAT32  0 +#define BRW_SURFACERETURNFORMAT_S1       1 + +#define BRW_SURFACE_1D      0 +#define BRW_SURFACE_2D      1 +#define BRW_SURFACE_3D      2 +#define BRW_SURFACE_CUBE    3 +#define BRW_SURFACE_BUFFER  4 +#define BRW_SURFACE_NULL    7 + +#define BRW_TEXCOORDMODE_WRAP            0 +#define BRW_TEXCOORDMODE_MIRROR          1 +#define BRW_TEXCOORDMODE_CLAMP           2 +#define BRW_TEXCOORDMODE_CUBE            3 +#define BRW_TEXCOORDMODE_CLAMP_BORDER    4 +#define BRW_TEXCOORDMODE_MIRROR_ONCE     5 + +#define BRW_THREAD_PRIORITY_NORMAL   0 +#define BRW_THREAD_PRIORITY_HIGH     1 + +#define BRW_TILEWALK_XMAJOR                 0 +#define BRW_TILEWALK_YMAJOR                 1 + +#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS  0 +#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS  1 + +#define BRW_VERTEXBUFFER_ACCESS_VERTEXDATA     0 +#define BRW_VERTEXBUFFER_ACCESS_INSTANCEDATA   1 + +#define BRW_VFCOMPONENT_NOSTORE      0 +#define BRW_VFCOMPONENT_STORE_SRC    1 +#define BRW_VFCOMPONENT_STORE_0      2 +#define BRW_VFCOMPONENT_STORE_1_FLT  3 +#define BRW_VFCOMPONENT_STORE_1_INT  4 +#define BRW_VFCOMPONENT_STORE_VID    5 +#define BRW_VFCOMPONENT_STORE_IID    6 +#define BRW_VFCOMPONENT_STORE_PID    7 + + + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1   0 +#define BRW_ALIGN_16  1 + +#define BRW_ADDRESS_DIRECT                        0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER    1 + +#define BRW_CHANNEL_X     0 +#define BRW_CHANNEL_Y     1 +#define BRW_CHANNEL_Z     2 +#define BRW_CHANNEL_W     3 + +#define BRW_COMPRESSION_NONE          0 +#define BRW_COMPRESSION_2NDHALF       1 +#define BRW_COMPRESSION_COMPRESSED    2 + +#define BRW_CONDITIONAL_NONE  0 +#define BRW_CONDITIONAL_Z     1 +#define BRW_CONDITIONAL_NZ    2 +#define BRW_CONDITIONAL_EQ    1	/* Z */ +#define BRW_CONDITIONAL_NEQ   2	/* NZ */ +#define BRW_CONDITIONAL_G     3 +#define BRW_CONDITIONAL_GE    4 +#define BRW_CONDITIONAL_L     5 +#define BRW_CONDITIONAL_LE    6 +#define BRW_CONDITIONAL_C     7 +#define BRW_CONDITIONAL_O     8 + +#define BRW_DEBUG_NONE        0 +#define BRW_DEBUG_BREAKPOINT  1 + +#define BRW_DEPENDENCY_NORMAL         0 +#define BRW_DEPENDENCY_NOTCLEARED     1 +#define BRW_DEPENDENCY_NOTCHECKED     2 +#define BRW_DEPENDENCY_DISABLE        3 + +#define BRW_EXECUTE_1     0 +#define BRW_EXECUTE_2     1 +#define BRW_EXECUTE_4     2 +#define BRW_EXECUTE_8     3 +#define BRW_EXECUTE_16    4 +#define BRW_EXECUTE_32    5 + +#define BRW_HORIZONTAL_STRIDE_0   0 +#define BRW_HORIZONTAL_STRIDE_1   1 +#define BRW_HORIZONTAL_STRIDE_2   2 +#define BRW_HORIZONTAL_STRIDE_4   3 + +#define BRW_INSTRUCTION_NORMAL    0 +#define BRW_INSTRUCTION_SATURATE  1 + +#define BRW_MASK_ENABLE   0 +#define BRW_MASK_DISABLE  1 + +#define BRW_OPCODE_MOV        1 +#define BRW_OPCODE_SEL        2 +#define BRW_OPCODE_NOT        4 +#define BRW_OPCODE_AND        5 +#define BRW_OPCODE_OR         6 +#define BRW_OPCODE_XOR        7 +#define BRW_OPCODE_SHR        8 +#define BRW_OPCODE_SHL        9 +#define BRW_OPCODE_RSR        10 +#define BRW_OPCODE_RSL        11 +#define BRW_OPCODE_ASR        12 +#define BRW_OPCODE_CMP        16 +#define BRW_OPCODE_JMPI       32 +#define BRW_OPCODE_IF         34 +#define BRW_OPCODE_IFF        35 +#define BRW_OPCODE_ELSE       36 +#define BRW_OPCODE_ENDIF      37 +#define BRW_OPCODE_DO         38 +#define BRW_OPCODE_WHILE      39 +#define BRW_OPCODE_BREAK   
   40 +#define BRW_OPCODE_CONTINUE   41 +#define BRW_OPCODE_HALT       42 +#define BRW_OPCODE_MSAVE      44 +#define BRW_OPCODE_MRESTORE   45 +#define BRW_OPCODE_PUSH       46 +#define BRW_OPCODE_POP        47 +#define BRW_OPCODE_WAIT       48 +#define BRW_OPCODE_SEND       49 +#define BRW_OPCODE_ADD        64 +#define BRW_OPCODE_MUL        65 +#define BRW_OPCODE_AVG        66 +#define BRW_OPCODE_FRC        67 +#define BRW_OPCODE_RNDU       68 +#define BRW_OPCODE_RNDD       69 +#define BRW_OPCODE_RNDE       70 +#define BRW_OPCODE_RNDZ       71 +#define BRW_OPCODE_MAC        72 +#define BRW_OPCODE_MACH       73 +#define BRW_OPCODE_LZD        74 +#define BRW_OPCODE_SAD2       80 +#define BRW_OPCODE_SADA2      81 +#define BRW_OPCODE_DP4        84 +#define BRW_OPCODE_DPH        85 +#define BRW_OPCODE_DP3        86 +#define BRW_OPCODE_DP2        87 +#define BRW_OPCODE_DPA2       88 +#define BRW_OPCODE_LINE       89 +#define BRW_OPCODE_NOP        126 + +#define BRW_PREDICATE_NONE             0 +#define BRW_PREDICATE_NORMAL           1 +#define BRW_PREDICATE_ALIGN1_ANYV             2 +#define BRW_PREDICATE_ALIGN1_ALLV             3 +#define BRW_PREDICATE_ALIGN1_ANY2H            4 +#define BRW_PREDICATE_ALIGN1_ALL2H            5 +#define BRW_PREDICATE_ALIGN1_ANY4H            6 +#define BRW_PREDICATE_ALIGN1_ALL4H            7 +#define BRW_PREDICATE_ALIGN1_ANY8H            8 +#define BRW_PREDICATE_ALIGN1_ALL8H            9 +#define BRW_PREDICATE_ALIGN1_ANY16H           10 +#define BRW_PREDICATE_ALIGN1_ALL16H           11 +#define BRW_PREDICATE_ALIGN16_REPLICATE_X     2 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Y     3 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Z     4 +#define BRW_PREDICATE_ALIGN16_REPLICATE_W     5 +#define BRW_PREDICATE_ALIGN16_ANY4H           6 +#define BRW_PREDICATE_ALIGN16_ALL4H           7 + +#define BRW_ARCHITECTURE_REGISTER_FILE    0 +#define BRW_GENERAL_REGISTER_FILE         1 +#define BRW_MESSAGE_REGISTER_FILE         2 +#define BRW_IMMEDIATE_VALUE               3 + +#define BRW_REGISTER_TYPE_UD  0 +#define BRW_REGISTER_TYPE_D   1 +#define BRW_REGISTER_TYPE_UW  2 +#define BRW_REGISTER_TYPE_W   3 +#define BRW_REGISTER_TYPE_UB  4 +#define BRW_REGISTER_TYPE_B   5 +#define BRW_REGISTER_TYPE_VF  5	/* packed float vector, immediates only? 
*/ +#define BRW_REGISTER_TYPE_HF  6 +#define BRW_REGISTER_TYPE_V   6	/* packed int vector, immediates only, uword dest only */ +#define BRW_REGISTER_TYPE_F   7 + +#define BRW_ARF_NULL                  0x00 +#define BRW_ARF_ADDRESS               0x10 +#define BRW_ARF_ACCUMULATOR           0x20 +#define BRW_ARF_FLAG                  0x30 +#define BRW_ARF_MASK                  0x40 +#define BRW_ARF_MASK_STACK            0x50 +#define BRW_ARF_MASK_STACK_DEPTH      0x60 +#define BRW_ARF_STATE                 0x70 +#define BRW_ARF_CONTROL               0x80 +#define BRW_ARF_NOTIFICATION_COUNT    0x90 +#define BRW_ARF_IP                    0xA0 + +#define BRW_AMASK   0 +#define BRW_IMASK   1 +#define BRW_LMASK   2 +#define BRW_CMASK   3 + + + +#define BRW_THREAD_NORMAL     0 +#define BRW_THREAD_ATOMIC     1 +#define BRW_THREAD_SWITCH     2 + +#define BRW_VERTICAL_STRIDE_0                 0 +#define BRW_VERTICAL_STRIDE_1                 1 +#define BRW_VERTICAL_STRIDE_2                 2 +#define BRW_VERTICAL_STRIDE_4                 3 +#define BRW_VERTICAL_STRIDE_8                 4 +#define BRW_VERTICAL_STRIDE_16                5 +#define BRW_VERTICAL_STRIDE_32                6 +#define BRW_VERTICAL_STRIDE_64                7 +#define BRW_VERTICAL_STRIDE_128               8 +#define BRW_VERTICAL_STRIDE_256               9 +#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF + +#define BRW_WIDTH_1       0 +#define BRW_WIDTH_2       1 +#define BRW_WIDTH_4       2 +#define BRW_WIDTH_8       3 +#define BRW_WIDTH_16      4 + +#define BRW_STATELESS_BUFFER_BOUNDARY_1K      0 +#define BRW_STATELESS_BUFFER_BOUNDARY_2K      1 +#define BRW_STATELESS_BUFFER_BOUNDARY_4K      2 +#define BRW_STATELESS_BUFFER_BOUNDARY_8K      3 +#define BRW_STATELESS_BUFFER_BOUNDARY_16K     4 +#define BRW_STATELESS_BUFFER_BOUNDARY_32K     5 +#define BRW_STATELESS_BUFFER_BOUNDARY_64K     6 +#define BRW_STATELESS_BUFFER_BOUNDARY_128K    7 +#define BRW_STATELESS_BUFFER_BOUNDARY_256K    8 +#define BRW_STATELESS_BUFFER_BOUNDARY_512K    9 +#define BRW_STATELESS_BUFFER_BOUNDARY_1M      10 +#define BRW_STATELESS_BUFFER_BOUNDARY_2M      11 + +#define BRW_POLYGON_FACING_FRONT      0 +#define BRW_POLYGON_FACING_BACK       1 + +#define BRW_MESSAGE_TARGET_NULL               0 +#define BRW_MESSAGE_TARGET_MATH               1 +#define BRW_MESSAGE_TARGET_SAMPLER            2 +#define BRW_MESSAGE_TARGET_GATEWAY            3 +#define BRW_MESSAGE_TARGET_DATAPORT_READ      4 +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5 +#define BRW_MESSAGE_TARGET_URB                6 +#define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7 + +#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0 +#define BRW_SAMPLER_RETURN_FORMAT_UINT32      2 +#define BRW_SAMPLER_RETURN_FORMAT_SINT32      3 + +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE              0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE             0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0 +#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX             1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2 +#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3 
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3 + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS     3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS     4 + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3 + +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0 +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE   2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE                2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7 + +#define BRW_MATH_FUNCTION_INV                              1 +#define BRW_MATH_FUNCTION_LOG                              2 +#define BRW_MATH_FUNCTION_EXP                              3 +#define BRW_MATH_FUNCTION_SQRT                             4 +#define BRW_MATH_FUNCTION_RSQ                              5 +#define BRW_MATH_FUNCTION_SIN                              6 /* was 7 */ +#define BRW_MATH_FUNCTION_COS                              7 /* was 8 */ +#define BRW_MATH_FUNCTION_SINCOS                           8 /* was 6 */ +#define BRW_MATH_FUNCTION_TAN                              9 +#define BRW_MATH_FUNCTION_POW                              10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT                 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER                13 + +#define BRW_MATH_INTEGER_UNSIGNED     0 +#define BRW_MATH_INTEGER_SIGNED       1 + +#define BRW_MATH_PRECISION_FULL        0 +#define BRW_MATH_PRECISION_PARTIAL     1 + +#define BRW_MATH_SATURATE_NONE         0 +#define BRW_MATH_SATURATE_SATURATE     1 + +#define BRW_MATH_DATA_VECTOR  0 +#define BRW_MATH_DATA_SCALAR  1 + +#define BRW_URB_OPCODE_WRITE  0 + +#define BRW_URB_SWIZZLE_NONE          0 +#define BRW_URB_SWIZZLE_INTERLEAVE    1 +#define BRW_URB_SWIZZLE_TRANSPOSE     2 + +#define BRW_SCRATCH_SPACE_SIZE_1K     0 +#define BRW_SCRATCH_SPACE_SIZE_2K     1 +#define BRW_SCRATCH_SPACE_SIZE_4K     2 +#define BRW_SCRATCH_SPACE_SIZE_8K     3 +#define BRW_SCRATCH_SPACE_SIZE_16K    4 +#define BRW_SCRATCH_SPACE_SIZE_32K    5 +#define BRW_SCRATCH_SPACE_SIZE_64K    6 +#define BRW_SCRATCH_SPACE_SIZE_128K   
7 +#define BRW_SCRATCH_SPACE_SIZE_256K   8 +#define BRW_SCRATCH_SPACE_SIZE_512K   9 +#define BRW_SCRATCH_SPACE_SIZE_1M     10 +#define BRW_SCRATCH_SPACE_SIZE_2M     11 + + + + +#define CMD_URB_FENCE                 0x6000 +#define CMD_CONST_BUFFER_STATE        0x6001 +#define CMD_CONST_BUFFER              0x6002 + +#define CMD_STATE_BASE_ADDRESS        0x6101 +#define CMD_STATE_INSN_POINTER        0x6102 +#define CMD_PIPELINE_SELECT           0x6104 + +#define CMD_PIPELINED_STATE_POINTERS  0x7800 +#define CMD_BINDING_TABLE_PTRS        0x7801 +#define CMD_VERTEX_BUFFER             0x7808 +#define CMD_VERTEX_ELEMENT            0x7809 +#define CMD_INDEX_BUFFER              0x780a +#define CMD_VF_STATISTICS             0x780b + +#define CMD_DRAW_RECT                 0x7900 +#define CMD_BLEND_CONSTANT_COLOR      0x7901 +#define CMD_CHROMA_KEY                0x7904 +#define CMD_DEPTH_BUFFER              0x7905 +#define CMD_POLY_STIPPLE_OFFSET       0x7906 +#define CMD_POLY_STIPPLE_PATTERN      0x7907 +#define CMD_LINE_STIPPLE_PATTERN      0x7908 +#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 + +#define CMD_PIPE_CONTROL              0x7a00 + +#define CMD_3D_PRIM                   0x7b00 + +#define CMD_MI_FLUSH                  0x0200 + + +/* Various values from the R0 vertex header: + */ +#define R02_PRIM_END    0x1 +#define R02_PRIM_START  0x2 + + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_draw.c b/src/gallium/drivers/i965simple/brw_draw.c new file mode 100644 index 0000000000..648aaa0da5 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw.c @@ -0,0 +1,239 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <stdlib.h> + +#include "brw_batch.h" +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" + +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" + +static unsigned hw_prim[PIPE_PRIM_POLYGON+1] = { +   _3DPRIM_POINTLIST, +   _3DPRIM_LINELIST, +   _3DPRIM_LINELOOP, +   _3DPRIM_LINESTRIP, +   _3DPRIM_TRILIST, +   _3DPRIM_TRISTRIP, +   _3DPRIM_TRIFAN, +   _3DPRIM_QUADLIST, +   _3DPRIM_QUADSTRIP, +   _3DPRIM_POLYGON +}; + + +static const int reduced_prim[PIPE_PRIM_POLYGON+1] = { +   PIPE_PRIM_POINTS, +   PIPE_PRIM_LINES, +   PIPE_PRIM_LINES, +   PIPE_PRIM_LINES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES +}; + + +/* When the primitive changes, set a state bit and re-validate.  Not + * the nicest and would rather deal with this by having all the + * programs be immune to the active primitive (ie. cope with all + * possibilities).  That may not be realistic however. + */ +static void brw_set_prim(struct brw_context *brw, int prim) +{ +   PRINT("PRIM: %d\n", prim); + +   /* Slight optimization to avoid the GS program when not needed: +    */ +   if (prim == PIPE_PRIM_QUAD_STRIP && +       brw->attribs.Raster->flatshade && +       brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_FILL && +       brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_FILL) +      prim = PIPE_PRIM_TRIANGLE_STRIP; + +   if (prim != brw->primitive) { +      brw->primitive = prim; +      brw->state.dirty.brw |= BRW_NEW_PRIMITIVE; + +      if (reduced_prim[prim] != brw->reduced_primitive) { +	 brw->reduced_primitive = reduced_prim[prim]; +	 brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE; +      } + +      brw_validate_state(brw); +   } + +} + + +static unsigned trim(int prim, unsigned length) +{ +   if (prim == PIPE_PRIM_QUAD_STRIP) +      return length > 3 ? (length - length % 2) : 0; +   else if (prim == PIPE_PRIM_QUADS) +      return length - length % 4; +   else +      return length; +} + + + +static boolean brw_emit_prim( struct brw_context *brw, +			      boolean indexed, +			      unsigned start, +			      unsigned count ) + +{ +   struct brw_3d_primitive prim_packet; + +   if (BRW_DEBUG & DEBUG_PRIMS) +      PRINT("PRIM: %d %d %d\n",  brw->primitive, start, count); + +   prim_packet.header.opcode = CMD_3D_PRIM; +   prim_packet.header.length = sizeof(prim_packet)/4 - 2; +   prim_packet.header.pad = 0; +   prim_packet.header.topology = hw_prim[brw->primitive]; +   prim_packet.header.indexed = indexed; + +   prim_packet.verts_per_instance = trim(brw->primitive, count); +   prim_packet.start_vert_location = start; +   prim_packet.instance_count = 1; +   prim_packet.start_instance_location = 0; +   prim_packet.base_vert_location = 0; + +   if (prim_packet.verts_per_instance == 0) +      return TRUE; + +   return brw_batchbuffer_data( brw->winsys, +                                &prim_packet, +                                sizeof(prim_packet) ); +} + + +/* May fail if out of video memory for texture or vbo upload, or on + * fallback conditions. 
+ */ +static boolean brw_try_draw_elements( struct pipe_context *pipe, +				      struct pipe_buffer *index_buffer, +				      unsigned index_size, +				      unsigned mode, +				      unsigned start, +				      unsigned count ) +{ +   struct brw_context *brw = brw_context(pipe); + +   /* Set the first primitive ahead of validate_state: +    */ +   brw_set_prim(brw, mode); + +   /* Upload index, vertex data: +    */ +   if (index_buffer && +       !brw_upload_indices( brw, index_buffer, index_size, start, count )) +      return FALSE; + +   if (!brw_upload_vertex_buffers(brw)) +      return FALSE; + +   if (!brw_upload_vertex_elements( brw )) +      return FALSE; + +   /* XXX:  Need to separate validate and upload of state. +    */ +   if (brw->state.dirty.brw) +      brw_validate_state( brw ); + +   if (!brw_emit_prim(brw, index_buffer != NULL, +                      start, count)) +      return FALSE; + +   return TRUE; +} + + + +static boolean brw_draw_elements( struct pipe_context *pipe, +				  struct pipe_buffer *indexBuffer, +				  unsigned indexSize, +				  unsigned mode, +				  unsigned start, +				  unsigned count ) +{ +   if (!brw_try_draw_elements( pipe, +			       indexBuffer, +			       indexSize, +			       mode, start, count )) +   { +      /* flush ? */ + +      if (!brw_try_draw_elements( pipe, +				  indexBuffer, +				  indexSize, +				  mode, start, +				  count )) { +	 assert(0); +	 return FALSE; +      } +   } + +   return TRUE; +} + + + +static boolean brw_draw_arrays( struct pipe_context *pipe, +				    unsigned mode, +				    unsigned start, +				    unsigned count ) +{ +   if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) { +      /* flush ? */ + +      if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) { +	 assert(0); +	 return FALSE; +      } +   } +    +   return TRUE; +} + + + +void brw_init_draw_functions( struct brw_context *brw ) +{ +   brw->pipe.draw_arrays = brw_draw_arrays; +   brw->pipe.draw_elements = brw_draw_elements; +} + + diff --git a/src/gallium/drivers/i965simple/brw_draw.h b/src/gallium/drivers/i965simple/brw_draw.h new file mode 100644 index 0000000000..62fe0d5d0e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw.h @@ -0,0 +1,55 @@ + /************************************************************************** + *  + * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef BRW_DRAW_H +#define BRW_DRAW_H + +#include "pipe/p_context.h" + +struct brw_context; + + + +void brw_init_draw_functions( struct brw_context *brw ); + + +boolean brw_upload_vertices( struct brw_context *brw, +			       unsigned min_index, +			       unsigned max_index ); + +boolean brw_upload_indices(struct brw_context *brw, +                           const struct pipe_buffer *index_buffer, +                           int ib_size, int start, int count); + +boolean brw_upload_vertex_buffers( struct brw_context *brw ); +boolean brw_upload_vertex_elements( struct brw_context *brw ); + +unsigned brw_translate_surface_format( unsigned id ); + + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c new file mode 100644 index 0000000000..2d9ca3f2ea --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw_upload.c @@ -0,0 +1,300 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <stdlib.h> + +#include "brw_batch.h" +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" + + +struct brw_array_state { +   union header_union header; + +   struct { +      union { +	 struct { +	    unsigned pitch:11; +	    unsigned pad:15; +	    unsigned access_type:1; +	    unsigned vb_index:5; +	 } bits; +	 unsigned dword; +      } vb0; + +      struct pipe_buffer *buffer; +      unsigned offset; + +      unsigned max_index; +      unsigned instance_data_step_rate; + +   } vb[BRW_VBP_MAX]; +}; + + + +unsigned brw_translate_surface_format( unsigned id ) +{ +   switch (id) { +   case PIPE_FORMAT_R64_FLOAT: +      return BRW_SURFACEFORMAT_R64_FLOAT; +   case PIPE_FORMAT_R64G64_FLOAT: +      return BRW_SURFACEFORMAT_R64G64_FLOAT; +   case PIPE_FORMAT_R64G64B64_FLOAT: +      return BRW_SURFACEFORMAT_R64G64B64_FLOAT; +   case PIPE_FORMAT_R64G64B64A64_FLOAT: +      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT; + +   case PIPE_FORMAT_R32_FLOAT: +      return BRW_SURFACEFORMAT_R32_FLOAT; +   case PIPE_FORMAT_R32G32_FLOAT: +      return BRW_SURFACEFORMAT_R32G32_FLOAT; +   case PIPE_FORMAT_R32G32B32_FLOAT: +      return BRW_SURFACEFORMAT_R32G32B32_FLOAT; +   case PIPE_FORMAT_R32G32B32A32_FLOAT: +      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; + +   case PIPE_FORMAT_R32_UNORM: +      return BRW_SURFACEFORMAT_R32_UNORM; +   case PIPE_FORMAT_R32G32_UNORM: +      return BRW_SURFACEFORMAT_R32G32_UNORM; +   case PIPE_FORMAT_R32G32B32_UNORM: +      return BRW_SURFACEFORMAT_R32G32B32_UNORM; +   case PIPE_FORMAT_R32G32B32A32_UNORM: +      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM; + +   case PIPE_FORMAT_R32_USCALED: +      return BRW_SURFACEFORMAT_R32_USCALED; +   case PIPE_FORMAT_R32G32_USCALED: +      return BRW_SURFACEFORMAT_R32G32_USCALED; +   case PIPE_FORMAT_R32G32B32_USCALED: +      return BRW_SURFACEFORMAT_R32G32B32_USCALED; +   case PIPE_FORMAT_R32G32B32A32_USCALED: +      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED; + +   case PIPE_FORMAT_R32_SNORM: +      return BRW_SURFACEFORMAT_R32_SNORM; +   case PIPE_FORMAT_R32G32_SNORM: +      return BRW_SURFACEFORMAT_R32G32_SNORM; +   case PIPE_FORMAT_R32G32B32_SNORM: +      return BRW_SURFACEFORMAT_R32G32B32_SNORM; +   case PIPE_FORMAT_R32G32B32A32_SNORM: +      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM; + +   case PIPE_FORMAT_R32_SSCALED: +      return BRW_SURFACEFORMAT_R32_SSCALED; +   case PIPE_FORMAT_R32G32_SSCALED: +      return BRW_SURFACEFORMAT_R32G32_SSCALED; +   case PIPE_FORMAT_R32G32B32_SSCALED: +      return BRW_SURFACEFORMAT_R32G32B32_SSCALED; +   case PIPE_FORMAT_R32G32B32A32_SSCALED: +      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED; + +   case PIPE_FORMAT_R16_UNORM: +      return BRW_SURFACEFORMAT_R16_UNORM; +   case PIPE_FORMAT_R16G16_UNORM: +      return BRW_SURFACEFORMAT_R16G16_UNORM; +   case PIPE_FORMAT_R16G16B16_UNORM: +      return BRW_SURFACEFORMAT_R16G16B16_UNORM; +   case PIPE_FORMAT_R16G16B16A16_UNORM: +      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM; + +   case PIPE_FORMAT_R16_USCALED: +      return BRW_SURFACEFORMAT_R16_USCALED; +   case PIPE_FORMAT_R16G16_USCALED: +      return BRW_SURFACEFORMAT_R16G16_USCALED; +   case PIPE_FORMAT_R16G16B16_USCALED: +      return BRW_SURFACEFORMAT_R16G16B16_USCALED; +   case PIPE_FORMAT_R16G16B16A16_USCALED: +      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED; + +   case PIPE_FORMAT_R16_SNORM: +      return BRW_SURFACEFORMAT_R16_SNORM; +   
case PIPE_FORMAT_R16G16_SNORM: +      return BRW_SURFACEFORMAT_R16G16_SNORM; +   case PIPE_FORMAT_R16G16B16_SNORM: +      return BRW_SURFACEFORMAT_R16G16B16_SNORM; +   case PIPE_FORMAT_R16G16B16A16_SNORM: +      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM; + +   case PIPE_FORMAT_R16_SSCALED: +      return BRW_SURFACEFORMAT_R16_SSCALED; +   case PIPE_FORMAT_R16G16_SSCALED: +      return BRW_SURFACEFORMAT_R16G16_SSCALED; +   case PIPE_FORMAT_R16G16B16_SSCALED: +      return BRW_SURFACEFORMAT_R16G16B16_SSCALED; +   case PIPE_FORMAT_R16G16B16A16_SSCALED: +      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED; + +   case PIPE_FORMAT_R8_UNORM: +      return BRW_SURFACEFORMAT_R8_UNORM; +   case PIPE_FORMAT_R8G8_UNORM: +      return BRW_SURFACEFORMAT_R8G8_UNORM; +   case PIPE_FORMAT_R8G8B8_UNORM: +      return BRW_SURFACEFORMAT_R8G8B8_UNORM; +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; + +   case PIPE_FORMAT_R8_USCALED: +      return BRW_SURFACEFORMAT_R8_USCALED; +   case PIPE_FORMAT_R8G8_USCALED: +      return BRW_SURFACEFORMAT_R8G8_USCALED; +   case PIPE_FORMAT_R8G8B8_USCALED: +      return BRW_SURFACEFORMAT_R8G8B8_USCALED; +   case PIPE_FORMAT_R8G8B8A8_USCALED: +      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED; + +   case PIPE_FORMAT_R8_SNORM: +      return BRW_SURFACEFORMAT_R8_SNORM; +   case PIPE_FORMAT_R8G8_SNORM: +      return BRW_SURFACEFORMAT_R8G8_SNORM; +   case PIPE_FORMAT_R8G8B8_SNORM: +      return BRW_SURFACEFORMAT_R8G8B8_SNORM; +   case PIPE_FORMAT_R8G8B8A8_SNORM: +      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM; + +   case PIPE_FORMAT_R8_SSCALED: +      return BRW_SURFACEFORMAT_R8_SSCALED; +   case PIPE_FORMAT_R8G8_SSCALED: +      return BRW_SURFACEFORMAT_R8G8_SSCALED; +   case PIPE_FORMAT_R8G8B8_SSCALED: +      return BRW_SURFACEFORMAT_R8G8B8_SSCALED; +   case PIPE_FORMAT_R8G8B8A8_SSCALED: +      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED; + +   default: +      assert(0); +      return 0; +   } +} + +static unsigned get_index_type(int type) +{ +   switch (type) { +   case 1: return BRW_INDEX_BYTE; +   case 2: return BRW_INDEX_WORD; +   case 4: return BRW_INDEX_DWORD; +   default: assert(0); return 0; +   } +} + + +boolean brw_upload_vertex_buffers( struct brw_context *brw ) +{ +   struct brw_array_state vbp; +   unsigned nr_enabled = 0; +   unsigned i; + +   memset(&vbp, 0, sizeof(vbp)); + +   /* This is a hardware limit: +    */ + +   for (i = 0; i < BRW_VEP_MAX; i++) +   { +      if (brw->vb.vbo_array[i] == NULL) { +	 nr_enabled = i; +	 break; +      } + +      vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->stride; +      vbp.vb[i].vb0.bits.pad = 0; +      vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA; +      vbp.vb[i].vb0.bits.vb_index = i; +      vbp.vb[i].offset = brw->vb.vbo_array[i]->buffer_offset; +      vbp.vb[i].buffer = brw->vb.vbo_array[i]->buffer; +      vbp.vb[i].max_index = brw->vb.vbo_array[i]->max_index; +   } + + +   vbp.header.bits.length = (1 + nr_enabled * 4) - 2; +   vbp.header.bits.opcode = CMD_VERTEX_BUFFER; + +   BEGIN_BATCH(vbp.header.bits.length+2, 0); +   OUT_BATCH( vbp.header.dword ); + +   for (i = 0; i < nr_enabled; i++) { +      OUT_BATCH( vbp.vb[i].vb0.dword ); +      OUT_RELOC( vbp.vb[i].buffer,  PIPE_BUFFER_USAGE_GPU_READ, +		 vbp.vb[i].offset); +      OUT_BATCH( vbp.vb[i].max_index ); +      OUT_BATCH( vbp.vb[i].instance_data_step_rate ); +   } +   ADVANCE_BATCH(); +   return TRUE; +} + + + +boolean brw_upload_vertex_elements( struct brw_context *brw ) +{ +   struct 
brw_vertex_element_packet vep; + +   unsigned i; +   unsigned nr_enabled = brw->attribs.VertexProgram->info.num_inputs; + +   memset(&vep, 0, sizeof(vep)); + +   for (i = 0; i < nr_enabled; i++)  +      vep.ve[i] = brw->vb.inputs[i]; + + +   vep.header.length = (1 + nr_enabled * sizeof(vep.ve[0])/4) - 2; +   vep.header.opcode = CMD_VERTEX_ELEMENT; +   brw_cached_batch_struct(brw, &vep, 4 + nr_enabled * sizeof(vep.ve[0])); + +   return TRUE; +} + +boolean brw_upload_indices( struct brw_context *brw, +                            const struct pipe_buffer *index_buffer, +                            int ib_size, int start, int count) +{ +   /* Emit the indexbuffer packet: +    */ +   { +      struct brw_indexbuffer ib; + +      memset(&ib, 0, sizeof(ib)); + +      ib.header.bits.opcode = CMD_INDEX_BUFFER; +      ib.header.bits.length = sizeof(ib)/4 - 2; +      ib.header.bits.index_format = get_index_type(ib_size); +      ib.header.bits.cut_index_enable = 0; + + +      BEGIN_BATCH(4, 0); +      OUT_BATCH( ib.header.dword ); +      OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start); +      OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start + count); +      OUT_BATCH( 0 ); +      ADVANCE_BATCH(); +   } +   return TRUE; +} diff --git a/src/gallium/drivers/i965simple/brw_eu.c b/src/gallium/drivers/i965simple/brw_eu.c new file mode 100644 index 0000000000..e2002d1821 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu.c @@ -0,0 +1,130 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +   + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + +/* How does predicate control work when execution_size != 8?  Do I + * need to test/set for 0xffff when execution_size is 16? 
+ */ +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ) +{ +   p->current->header.predicate_control = BRW_PREDICATE_NONE; + +   if (value != 0xff) { +      if (value != p->flag_value) { +	 brw_push_insn_state(p); +	 brw_MOV(p, brw_flag_reg(), brw_imm_uw(value)); +	 p->flag_value = value; +	 brw_pop_insn_state(p); +      } + +      p->current->header.predicate_control = BRW_PREDICATE_NORMAL; +   }    +} + +void brw_set_predicate_control( struct brw_compile *p, unsigned pc ) +{ +   p->current->header.predicate_control = pc; +} + +void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional ) +{ +   p->current->header.destreg__conditonalmod = conditional; +} + +void brw_set_access_mode( struct brw_compile *p, unsigned access_mode ) +{ +   p->current->header.access_mode = access_mode; +} + +void brw_set_compression_control( struct brw_compile *p, boolean compression_control ) +{ +   p->current->header.compression_control = compression_control; +} + +void brw_set_mask_control( struct brw_compile *p, unsigned value ) +{ +   p->current->header.mask_control = value; +} + +void brw_set_saturate( struct brw_compile *p, unsigned value ) +{ +   p->current->header.saturate = value; +} + +void brw_push_insn_state( struct brw_compile *p ) +{ +   assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); +   memcpy(p->current+1, p->current, sizeof(struct brw_instruction)); +   p->current++;    +} + +void brw_pop_insn_state( struct brw_compile *p ) +{ +   assert(p->current != p->stack); +   p->current--; +} + + +/*********************************************************************** + */ +void brw_init_compile( struct brw_compile *p ) +{ +   p->nr_insn = 0; +   p->current = p->stack; +   memset(p->current, 0, sizeof(p->current[0])); + +   /* Some defaults? +    */ +   brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ +   brw_set_saturate(p, 0); +   brw_set_compression_control(p, BRW_COMPRESSION_NONE); +   brw_set_predicate_control_flag_value(p, 0xff);  +} + + +const unsigned *brw_get_program( struct brw_compile *p, +			       unsigned *sz ) +{ +   unsigned i; + +   for (i = 0; i < 8; i++) +      brw_NOP(p); + +   *sz = p->nr_insn * sizeof(struct brw_instruction); +   return (const unsigned *)p->store; +} + diff --git a/src/gallium/drivers/i965simple/brw_eu.h b/src/gallium/drivers/i965simple/brw_eu.h new file mode 100644 index 0000000000..23151ae9ed --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu.h @@ -0,0 +1,888 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include "brw_structs.h" +#include "brw_defines.h" + +#include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" + +#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) +#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +#define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0) +#define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1) + + +#define REG_SIZE (8*4) + + +/* These aren't hardware structs, just something useful for us to pass around: + * + * Align1 operation has a lot of control over input ranges.  Used in + * WM programs to implement shaders decomposed into "channel serial" + * or "structure of array" form: + */ +struct brw_reg +{ +   unsigned type:4; +   unsigned file:2; +   unsigned nr:8; +   unsigned subnr:5;		/* :1 in align16 */ +   unsigned negate:1;		/* source only */ +   unsigned abs:1;		/* source only */ +   unsigned vstride:4;		/* source only */ +   unsigned width:3;		/* src only, align1 only */ +   unsigned hstride:2;   		/* src only, align1 only */ +   unsigned address_mode:1;	/* relative addressing, hopefully! */ +   unsigned pad0:1; + +   union { +      struct { +	 unsigned swizzle:8;		/* src only, align16 only */ +	 unsigned writemask:4;		/* dest only, align16 only */ +	 int  indirect_offset:10;	/* relative addressing offset */ +	 unsigned pad1:10;		/* two dwords total */ +      } bits; + +      float f; +      int   d; +      unsigned ud; +   } dw1; +}; + + +struct brw_indirect { +   unsigned addr_subnr:4; +   int addr_offset:10; +   unsigned pad:18; +}; + + +#define BRW_EU_MAX_INSN_STACK 5 +#define BRW_EU_MAX_INSN 1200 + +struct brw_compile { +   struct brw_instruction store[BRW_EU_MAX_INSN]; +   unsigned nr_insn; + +   /* Allow clients to push/pop instruction state: +    */ +   struct brw_instruction stack[BRW_EU_MAX_INSN_STACK]; +   struct brw_instruction *current; + +   unsigned flag_value; +   boolean single_program_flow; +}; + + + +static __inline int type_sz( unsigned type ) +{ +   switch( type ) { +   case BRW_REGISTER_TYPE_UD: +   case BRW_REGISTER_TYPE_D: +   case BRW_REGISTER_TYPE_F: +      return 4; +   case BRW_REGISTER_TYPE_HF: +   case BRW_REGISTER_TYPE_UW: +   case BRW_REGISTER_TYPE_W: +      return 2; +   case BRW_REGISTER_TYPE_UB: +   case BRW_REGISTER_TYPE_B: +      return 1; +   default: +      return 0; +   } +} + +static __inline struct brw_reg brw_reg( unsigned file, +					unsigned nr, +					unsigned subnr, +					unsigned type, +					unsigned vstride, +					unsigned width, +					unsigned hstride, +					unsigned swizzle, +					unsigned writemask) +{ + +   struct brw_reg reg; +   reg.type = type; +   reg.file = file; +   reg.nr = nr; +   reg.subnr = subnr * type_sz(type); +   reg.negate = 0; +   reg.abs = 0; +   reg.vstride = vstride; +   reg.width = width; +   reg.hstride = hstride; +   reg.address_mode = BRW_ADDRESS_DIRECT; +   reg.pad0 = 0; + +   /* Could do better: If the reg is r5.3<0;1,0>, we probably want to +    * set swizzle and writemask 
to W, as the lower bits of subnr will +    * be lost when converted to align16.  This is probably too much to +    * keep track of as you'd want it adjusted by suboffset(), etc. +    * Perhaps fix up when converting to align16? +    */ +   reg.dw1.bits.swizzle = swizzle; +   reg.dw1.bits.writemask = writemask; +   reg.dw1.bits.indirect_offset = 0; +   reg.dw1.bits.pad1 = 0; +   return reg; +} + +static __inline struct brw_reg brw_vec16_reg( unsigned file, +					      unsigned nr, +					      unsigned subnr ) +{ +   return brw_reg(file, +		  nr, +		  subnr, +		  BRW_REGISTER_TYPE_F, +		  BRW_VERTICAL_STRIDE_16, +		  BRW_WIDTH_16, +		  BRW_HORIZONTAL_STRIDE_1, +		  BRW_SWIZZLE_XYZW, +		  TGSI_WRITEMASK_XYZW); +} + +static __inline struct brw_reg brw_vec8_reg( unsigned file, +					     unsigned nr, +					     unsigned subnr ) +{ +   return brw_reg(file, +		  nr, +		  subnr, +		  BRW_REGISTER_TYPE_F, +		  BRW_VERTICAL_STRIDE_8, +		  BRW_WIDTH_8, +		  BRW_HORIZONTAL_STRIDE_1, +		  BRW_SWIZZLE_XYZW, +		  TGSI_WRITEMASK_XYZW); +} + + +static __inline struct brw_reg brw_vec4_reg( unsigned file, +					      unsigned nr, +					      unsigned subnr ) +{ +   return brw_reg(file, +		  nr, +		  subnr, +		  BRW_REGISTER_TYPE_F, +		  BRW_VERTICAL_STRIDE_4, +		  BRW_WIDTH_4, +		  BRW_HORIZONTAL_STRIDE_1, +		  BRW_SWIZZLE_XYZW, +		  TGSI_WRITEMASK_XYZW); +} + + +static __inline struct brw_reg brw_vec2_reg( unsigned file, +					      unsigned nr, +					      unsigned subnr ) +{ +   return brw_reg(file, +		  nr, +		  subnr, +		  BRW_REGISTER_TYPE_F, +		  BRW_VERTICAL_STRIDE_2, +		  BRW_WIDTH_2, +		  BRW_HORIZONTAL_STRIDE_1, +		  BRW_SWIZZLE_XYXY, +		  TGSI_WRITEMASK_XY); +} + +static __inline struct brw_reg brw_vec1_reg( unsigned file, +					     unsigned nr, +					     unsigned subnr ) +{ +   return brw_reg(file, +		  nr, +		  subnr, +		  BRW_REGISTER_TYPE_F, +		  BRW_VERTICAL_STRIDE_0, +		  BRW_WIDTH_1, +		  BRW_HORIZONTAL_STRIDE_0, +		  BRW_SWIZZLE_XXXX, +		  TGSI_WRITEMASK_X); +} + + +static __inline struct brw_reg retype( struct brw_reg reg, +				       unsigned type ) +{ +   reg.type = type; +   return reg; +} + +static __inline struct brw_reg suboffset( struct brw_reg reg, +					  unsigned delta ) +{ +   reg.subnr += delta * type_sz(reg.type); +   return reg; +} + + +static __inline struct brw_reg offset( struct brw_reg reg, +				       unsigned delta ) +{ +   reg.nr += delta; +   return reg; +} + + +static __inline struct brw_reg byte_offset( struct brw_reg reg, +					    unsigned bytes ) +{ +   unsigned newoffset = reg.nr * REG_SIZE + reg.subnr + bytes; +   reg.nr = newoffset / REG_SIZE; +   reg.subnr = newoffset % REG_SIZE; +   return reg; +} + + +static __inline struct brw_reg brw_uw16_reg( unsigned file, +					     unsigned nr, +					     unsigned subnr ) +{ +   return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_uw8_reg( unsigned file, +					    unsigned nr, +					    unsigned subnr ) +{ +   return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_uw1_reg( unsigned file, +					    unsigned nr, +					    unsigned subnr ) +{ +   return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_imm_reg( unsigned type ) +{ +   return brw_reg( BRW_IMMEDIATE_VALUE, +		   0, +		   0, +		   type, +		   BRW_VERTICAL_STRIDE_0, +		   BRW_WIDTH_1, +		   BRW_HORIZONTAL_STRIDE_0, +		   0, +		   0); +} + +static 
__inline struct brw_reg brw_imm_f( float f ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); +   imm.dw1.f = f; +   return imm; +} + +static __inline struct brw_reg brw_imm_d( int d ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); +   imm.dw1.d = d; +   return imm; +} + +static __inline struct brw_reg brw_imm_ud( unsigned ud ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); +   imm.dw1.ud = ud; +   return imm; +} + +static __inline struct brw_reg brw_imm_uw( ushort uw ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); +   imm.dw1.ud = uw; +   return imm; +} + +static __inline struct brw_reg brw_imm_w( short w ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); +   imm.dw1.d = w; +   return imm; +} + +/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type + * numbers alias with _V and _VF below: + */ + +/* Vector of eight signed half-byte values: + */ +static __inline struct brw_reg brw_imm_v( unsigned v ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); +   imm.vstride = BRW_VERTICAL_STRIDE_0; +   imm.width = BRW_WIDTH_8; +   imm.hstride = BRW_HORIZONTAL_STRIDE_1; +   imm.dw1.ud = v; +   return imm; +} + +/* Vector of four 8-bit float values: + */ +static __inline struct brw_reg brw_imm_vf( unsigned v ) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); +   imm.vstride = BRW_VERTICAL_STRIDE_0; +   imm.width = BRW_WIDTH_4; +   imm.hstride = BRW_HORIZONTAL_STRIDE_1; +   imm.dw1.ud = v; +   return imm; +} + +#define VF_ZERO 0x0 +#define VF_ONE  0x30 +#define VF_NEG  (1<<7) + +static __inline struct brw_reg brw_imm_vf4( unsigned v0, +					    unsigned v1, +					    unsigned v2, +					    unsigned v3) +{ +   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); +   imm.vstride = BRW_VERTICAL_STRIDE_0; +   imm.width = BRW_WIDTH_4; +   imm.hstride = BRW_HORIZONTAL_STRIDE_1; +   imm.dw1.ud = ((v0 << 0) | +		 (v1 << 8) | +		 (v2 << 16) | +		 (v3 << 24)); +   return imm; +} + + +static __inline struct brw_reg brw_address( struct brw_reg reg ) +{ +   return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); +} + + +static __inline struct brw_reg brw_vec1_grf( unsigned nr, +					       unsigned subnr ) +{ +   return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_vec8_grf( unsigned nr, +					     unsigned subnr ) +{ +   return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_vec4_grf( unsigned nr, +					     unsigned subnr ) +{ +   return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +static __inline struct brw_reg brw_vec2_grf( unsigned nr, +					     unsigned subnr ) +{ +   return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_uw8_grf( unsigned nr, +					    unsigned subnr ) +{ +   return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_null_reg( void ) +{ +   return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		       BRW_ARF_NULL, +		       0); +} + +static __inline struct brw_reg brw_address_reg( unsigned subnr ) +{ +   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		      BRW_ARF_ADDRESS, +		      subnr); +} + +/* If/else instructions break in align16 mode if writemask & swizzle + * aren't xyzw.  
This goes against the convention for other scalar + * regs: + */ +static __inline struct brw_reg brw_ip_reg( void ) +{ +   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		  BRW_ARF_IP, +		  0, +		  BRW_REGISTER_TYPE_UD, +		  BRW_VERTICAL_STRIDE_4, /* ? */ +		  BRW_WIDTH_1, +		  BRW_HORIZONTAL_STRIDE_0, +		  BRW_SWIZZLE_XYZW, /* NOTE! */ +		  TGSI_WRITEMASK_XYZW); /* NOTE! */ +} + +static __inline struct brw_reg brw_acc_reg( void ) +{ +   return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		       BRW_ARF_ACCUMULATOR, +		       0); +} + + +static __inline struct brw_reg brw_flag_reg( void ) +{ +   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		      BRW_ARF_FLAG, +		      0); +} + + +static __inline struct brw_reg brw_mask_reg( unsigned subnr ) +{ +   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, +		      BRW_ARF_MASK, +		      subnr); +} + +static __inline struct brw_reg brw_message_reg( unsigned nr ) +{ +   return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, +		       nr, +		       0); +} + + + + +/* This is almost always called with a numeric constant argument, so + * make things easy to evaluate at compile time: + */ +static __inline unsigned cvt( unsigned val ) +{ +   switch (val) { +   case 0: return 0; +   case 1: return 1; +   case 2: return 2; +   case 4: return 3; +   case 8: return 4; +   case 16: return 5; +   case 32: return 6; +   } +   return 0; +} + +static __inline struct brw_reg stride( struct brw_reg reg, +				       unsigned vstride, +				       unsigned width, +				       unsigned hstride ) +{ + +   reg.vstride = cvt(vstride); +   reg.width = cvt(width) - 1; +   reg.hstride = cvt(hstride); +   return reg; +} + +static __inline struct brw_reg vec16( struct brw_reg reg ) +{ +   return stride(reg, 16,16,1); +} + +static __inline struct brw_reg vec8( struct brw_reg reg ) +{ +   return stride(reg, 8,8,1); +} + +static __inline struct brw_reg vec4( struct brw_reg reg ) +{ +   return stride(reg, 4,4,1); +} + +static __inline struct brw_reg vec2( struct brw_reg reg ) +{ +   return stride(reg, 2,2,1); +} + +static __inline struct brw_reg vec1( struct brw_reg reg ) +{ +   return stride(reg, 0,1,0); +} + +static __inline struct brw_reg get_element( struct brw_reg reg, unsigned elt ) +{ +   return vec1(suboffset(reg, elt)); +} + +static __inline struct brw_reg get_element_ud( struct brw_reg reg, unsigned elt ) +{ +   return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); +} + + +static __inline struct brw_reg brw_swizzle( struct brw_reg reg, +					    unsigned x, +					    unsigned y, +					    unsigned z, +					    unsigned w) +{ +   reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x), +				       BRW_GET_SWZ(reg.dw1.bits.swizzle, y), +				       BRW_GET_SWZ(reg.dw1.bits.swizzle, z), +				       BRW_GET_SWZ(reg.dw1.bits.swizzle, w)); +   return reg; +} + + +static __inline struct brw_reg brw_swizzle1( struct brw_reg reg, +					     unsigned x ) +{ +   return brw_swizzle(reg, x, x, x, x); +} + +static __inline struct brw_reg brw_writemask( struct brw_reg reg, +					      unsigned mask ) +{ +   reg.dw1.bits.writemask &= mask; +   return reg; +} + +static __inline struct brw_reg brw_set_writemask( struct brw_reg reg, +						  unsigned mask ) +{ +   reg.dw1.bits.writemask = mask; +   return reg; +} + +static __inline struct brw_reg negate( struct brw_reg reg ) +{ +   reg.negate ^= 1; +   return reg; +} + +static __inline struct brw_reg brw_abs( struct brw_reg reg ) +{ +   reg.abs = 1; +   return reg; +} + 
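
A minimal sketch of how the register helpers above compose, using only definitions that already appear earlier in this header (brw_vec8_grf, get_element, brw_swizzle1, brw_imm_vf4 and the BRW_* encodings); the function name below is invented purely for illustration and is not part of the original patch:

static __inline void brw_eu_region_example( void )
{
   /* r2.0<8;8,1>:f -- a whole GRF register viewed as 8 floats */
   struct brw_reg v = brw_vec8_grf(2, 0);

   /* vec1(suboffset(v, 3)): byte offset 12 with region <0;1,0>,
    * i.e. the fourth float of r2 broadcast to every channel */
   struct brw_reg s = get_element(v, 3);

   /* the align16 equivalent is a swizzle: replicating W turns the
    * default XYZW swizzle (0xE4) into BRW_SWIZZLE4(3,3,3,3) == 0xFF */
   struct brw_reg w = brw_swizzle1(v, BRW_CHANNEL_W);

   /* an immediate (0.0, 0.0, 0.0, 1.0) vector in the packed 8-bit
    * restricted-float encoding consumed by brw_imm_vf4() */
   struct brw_reg one_w = brw_imm_vf4(VF_ZERO, VF_ZERO, VF_ZERO, VF_ONE);

   (void) s;
   (void) w;
   (void) one_w;
}
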
+/*********************************************************************** + */ +static __inline struct brw_reg brw_vec4_indirect( unsigned subnr, +						  int offset ) +{ +   struct brw_reg reg =  brw_vec4_grf(0, 0); +   reg.subnr = subnr; +   reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; +   reg.dw1.bits.indirect_offset = offset; +   return reg; +} + +static __inline struct brw_reg brw_vec1_indirect( unsigned subnr, +						  int offset ) +{ +   struct brw_reg reg =  brw_vec1_grf(0, 0); +   reg.subnr = subnr; +   reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; +   reg.dw1.bits.indirect_offset = offset; +   return reg; +} + +static __inline struct brw_reg deref_4f(struct brw_indirect ptr, int offset) +{ +   return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static __inline struct brw_reg deref_1f(struct brw_indirect ptr, int offset) +{ +   return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static __inline struct brw_reg deref_4b(struct brw_indirect ptr, int offset) +{ +   return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B); +} + +static __inline struct brw_reg deref_1uw(struct brw_indirect ptr, int offset) +{ +   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW); +} + +static __inline struct brw_reg deref_1ud(struct brw_indirect ptr, int offset) +{ +   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD); +} + +static __inline struct brw_reg get_addr_reg(struct brw_indirect ptr) +{ +   return brw_address_reg(ptr.addr_subnr); +} + +static __inline struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, int offset ) +{ +   ptr.addr_offset += offset; +   return ptr; +} + +static __inline struct brw_indirect brw_indirect( unsigned addr_subnr, int offset ) +{ +   struct brw_indirect ptr; +   ptr.addr_subnr = addr_subnr; +   ptr.addr_offset = offset; +   ptr.pad = 0; +   return ptr; +} + +static __inline struct brw_instruction *current_insn( struct brw_compile *p) +{ +	return &p->store[p->nr_insn]; +} + +void brw_pop_insn_state( struct brw_compile *p ); +void brw_push_insn_state( struct brw_compile *p ); +void brw_set_mask_control( struct brw_compile *p, unsigned value ); +void brw_set_saturate( struct brw_compile *p, unsigned value ); +void brw_set_access_mode( struct brw_compile *p, unsigned access_mode ); +void brw_set_compression_control( struct brw_compile *p, boolean control ); +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ); +void brw_set_predicate_control( struct brw_compile *p, unsigned pc ); +void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional ); + +void brw_init_compile( struct brw_compile *p ); +const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz ); + + +struct brw_instruction *brw_alu1( struct brw_compile *p, +				  unsigned opcode, +				  struct brw_reg dest, +				  struct brw_reg src ); + +struct brw_instruction *brw_alu2(struct brw_compile *p, +				 unsigned opcode, +				 struct brw_reg dest, +				 struct brw_reg src0, +				 struct brw_reg src1 ); + +/* Helpers for regular instructions: + */ +#define ALU1(OP)					\ +struct brw_instruction *brw_##OP(struct brw_compile *p,	\ +	      struct brw_reg dest,			\ +	      struct brw_reg src0); + +#define ALU2(OP)					\ +struct brw_instruction *brw_##OP(struct brw_compile *p,	\ +	      struct brw_reg dest,			\ +	      struct brw_reg src0,			\ +	      struct brw_reg src1); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) 
+ALU2(RSR) +ALU2(RSL) +ALU2(ASR) +ALU2(JMPI) +ALU2(ADD) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) + +#undef ALU1 +#undef ALU2 + + + +/* Helpers for SEND instruction: + */ +void brw_urb_WRITE(struct brw_compile *p, +		   struct brw_reg dest, +		   unsigned msg_reg_nr, +		   struct brw_reg src0, +		   boolean allocate, +		   boolean used, +		   unsigned msg_length, +		   unsigned response_length, +		   boolean eot, +		   boolean writes_complete, +		   unsigned offset, +		   unsigned swizzle); + +void brw_fb_WRITE(struct brw_compile *p, +		   struct brw_reg dest, +		   unsigned msg_reg_nr, +		   struct brw_reg src0, +		   unsigned binding_table_index, +		   unsigned msg_length, +		   unsigned response_length, +		   boolean eot); + +void brw_SAMPLE(struct brw_compile *p, +		struct brw_reg dest, +		unsigned msg_reg_nr, +		struct brw_reg src0, +		unsigned binding_table_index, +		unsigned sampler, +		unsigned writemask, +		unsigned msg_type, +		unsigned response_length, +		unsigned msg_length, +		boolean eot); + +void brw_math_16( struct brw_compile *p, +		  struct brw_reg dest, +		  unsigned function, +		  unsigned saturate, +		  unsigned msg_reg_nr, +		  struct brw_reg src, +		  unsigned precision ); + +void brw_math( struct brw_compile *p, +	       struct brw_reg dest, +	       unsigned function, +	       unsigned saturate, +	       unsigned msg_reg_nr, +	       struct brw_reg src, +	       unsigned data_type, +	       unsigned precision ); + +void brw_dp_READ_16( struct brw_compile *p, +		     struct brw_reg dest, +		     unsigned msg_reg_nr, +		     unsigned scratch_offset ); + +void brw_dp_WRITE_16( struct brw_compile *p, +		      struct brw_reg src, +		      unsigned msg_reg_nr, +		      unsigned scratch_offset ); + +/* If/else/endif.  Works by manipulating the execution flags on each + * channel. 
+ */ +struct brw_instruction *brw_IF(struct brw_compile *p, +			       unsigned execute_size); + +struct brw_instruction *brw_ELSE(struct brw_compile *p, +				 struct brw_instruction *if_insn); + +void brw_ENDIF(struct brw_compile *p, +	       struct brw_instruction *if_or_else_insn); + + +/* DO/WHILE loops: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, +			       unsigned execute_size); + +struct brw_instruction *brw_WHILE(struct brw_compile *p, +	       struct brw_instruction *patch_insn); + +struct brw_instruction *brw_BREAK(struct brw_compile *p); +struct brw_instruction *brw_CONT(struct brw_compile *p); +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_compile *p, +		       struct brw_instruction *jmp_insn); + + + +void brw_NOP(struct brw_compile *p); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_compile *p, +	     struct brw_reg dest, +	     unsigned conditional, +	     struct brw_reg src0, +	     struct brw_reg src1); + +void brw_print_reg( struct brw_reg reg ); + + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_compile *p, +				   struct brw_indirect dst_ptr, +				   struct brw_indirect src_ptr, +				   unsigned count); + +void brw_copy_from_indirect(struct brw_compile *p, +			    struct brw_reg dst, +			    struct brw_indirect ptr, +			    unsigned count); + +void brw_copy4(struct brw_compile *p, +	       struct brw_reg dst, +	       struct brw_reg src, +	       unsigned count); + +void brw_copy8(struct brw_compile *p, +	       struct brw_reg dst, +	       struct brw_reg src, +	       unsigned count); + +void brw_math_invert( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg src); + +void brw_set_src1( struct brw_instruction *insn, +                          struct brw_reg reg ); +#endif diff --git a/src/gallium/drivers/i965simple/brw_eu_debug.c b/src/gallium/drivers/i965simple/brw_eu_debug.c new file mode 100644 index 0000000000..4a94ddefa6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_debug.c @@ -0,0 +1,90 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +     + +#include "pipe/p_debug.h" + +#include "brw_eu.h" + +void brw_print_reg( struct brw_reg hwreg ) +{ +   static const char *file[] = { +      "arf", +      "grf", +      "msg", +      "imm" +   }; + +   static const char *type[] = { +      "ud", +      "d", +      "uw", +      "w", +      "ub", +      "vf", +      "hf", +      "f" +   }; + +   debug_printf("%s%s",  +		hwreg.abs ? "abs/" : "", +		hwreg.negate ? "-" : ""); +      +   if (hwreg.file == BRW_GENERAL_REGISTER_FILE && +       hwreg.nr % 2 == 0 && +       hwreg.subnr == 0 && +       hwreg.vstride == BRW_VERTICAL_STRIDE_8 && +       hwreg.width == BRW_WIDTH_8 && +       hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 && +       hwreg.type == BRW_REGISTER_TYPE_F) { +      debug_printf("vec%d", hwreg.nr); +   } +   else if (hwreg.file == BRW_GENERAL_REGISTER_FILE && +	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 && +	    hwreg.width == BRW_WIDTH_1 && +	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 && +	    hwreg.type == BRW_REGISTER_TYPE_F) {       +      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); +   } +   else { +      debug_printf("%s%d.%d<%d;%d,%d>:%s",  +		   file[hwreg.file], +		   hwreg.nr, +		   hwreg.subnr / type_sz(hwreg.type), +		   hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0, +		   1<<hwreg.width, +		   hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,		 +		   type[hwreg.type]); +   } +} + + + diff --git a/src/gallium/drivers/i965simple/brw_eu_emit.c b/src/gallium/drivers/i965simple/brw_eu_emit.c new file mode 100644 index 0000000000..400a80b6fb --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_emit.c @@ -0,0 +1,1080 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + + +/*********************************************************************** + * Internal helper for constructing instructions + */ + +static void guess_execution_size( struct brw_instruction *insn, +				  struct brw_reg reg ) +{ +   if (reg.width == BRW_WIDTH_8 && +       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) +      insn->header.execution_size = BRW_EXECUTE_16; +   else +      insn->header.execution_size = reg.width;	/* note - definitions are compatible */ +} + + +static void brw_set_dest( struct brw_instruction *insn, +			  struct brw_reg dest ) +{ +   insn->bits1.da1.dest_reg_file = dest.file; +   insn->bits1.da1.dest_reg_type = dest.type; +   insn->bits1.da1.dest_address_mode = dest.address_mode; + +   if (dest.address_mode == BRW_ADDRESS_DIRECT) { +      insn->bits1.da1.dest_reg_nr = dest.nr; + +      if (insn->header.access_mode == BRW_ALIGN_1) { +	 insn->bits1.da1.dest_subreg_nr = dest.subnr; +	 insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; +      } +      else { +	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; +	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask; +      } +   } +   else { +      insn->bits1.ia1.dest_subreg_nr = dest.subnr; + +      /* These are different sizes in align1 vs align16: +       */ +      if (insn->header.access_mode == BRW_ALIGN_1) { +	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; +	 insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; +      } +      else { +	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; +      } +   } + +   /* NEW: Set the execution size based on dest.width and +    * insn->compression_control: +    */ +   guess_execution_size(insn, dest); +} + +static void brw_set_src0( struct brw_instruction *insn, +		      struct brw_reg reg ) +{ +   assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + +   insn->bits1.da1.src0_reg_file = reg.file; +   insn->bits1.da1.src0_reg_type = reg.type; +   insn->bits2.da1.src0_abs = reg.abs; +   insn->bits2.da1.src0_negate = reg.negate; +   insn->bits2.da1.src0_address_mode = reg.address_mode; + +   if (reg.file == BRW_IMMEDIATE_VALUE) { +      insn->bits3.ud = reg.dw1.ud; + +      /* Required to set some fields in src1 as well: +       */ +      insn->bits1.da1.src1_reg_file = 0; /* arf */ +      insn->bits1.da1.src1_reg_type = reg.type; +   } +   else +   { +      if (reg.address_mode == BRW_ADDRESS_DIRECT) { +	 if (insn->header.access_mode == BRW_ALIGN_1) { +	    insn->bits2.da1.src0_subreg_nr = reg.subnr; +	    insn->bits2.da1.src0_reg_nr = reg.nr; +	 } +	 else { +	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16; +	    insn->bits2.da16.src0_reg_nr = reg.nr; +	 } +      } +      else { +	 insn->bits2.ia1.src0_subreg_nr = reg.subnr; + +	 if (insn->header.access_mode == BRW_ALIGN_1) { +	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; +	 } +	 else { +	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset; +	 } +      } + +      if (insn->header.access_mode == BRW_ALIGN_1) { +	 if (reg.width == BRW_WIDTH_1 && +	     insn->header.execution_size == BRW_EXECUTE_1) { +	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0; +	    insn->bits2.da1.src0_width = BRW_WIDTH_1; +	    insn->bits2.da1.src0_vert_stride = 
BRW_VERTICAL_STRIDE_0; +	 } +	 else { +	    insn->bits2.da1.src0_horiz_stride = reg.hstride; +	    insn->bits2.da1.src0_width = reg.width; +	    insn->bits2.da1.src0_vert_stride = reg.vstride; +	 } +      } +      else { +	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); +	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); +	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); +	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + +	 /* This is an oddity of the fact we're using the same +	  * descriptions for registers in align_16 as align_1: +	  */ +	 if (reg.vstride == BRW_VERTICAL_STRIDE_8) +	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4; +	 else +	    insn->bits2.da16.src0_vert_stride = reg.vstride; +      } +   } +} + + +void brw_set_src1( struct brw_instruction *insn, +			  struct brw_reg reg ) +{ +   assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + +   insn->bits1.da1.src1_reg_file = reg.file; +   insn->bits1.da1.src1_reg_type = reg.type; +   insn->bits3.da1.src1_abs = reg.abs; +   insn->bits3.da1.src1_negate = reg.negate; + +   /* Only src1 can be immediate in two-argument instructions. +    */ +   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE); + +   if (reg.file == BRW_IMMEDIATE_VALUE) { +      insn->bits3.ud = reg.dw1.ud; +   } +   else { +      /* This is a hardware restriction, which may or may not be lifted +       * in the future: +       */ +      assert (reg.address_mode == BRW_ADDRESS_DIRECT); +      //assert (reg.file == BRW_GENERAL_REGISTER_FILE); + +      if (insn->header.access_mode == BRW_ALIGN_1) { +	 insn->bits3.da1.src1_subreg_nr = reg.subnr; +	 insn->bits3.da1.src1_reg_nr = reg.nr; +      } +      else { +	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16; +	 insn->bits3.da16.src1_reg_nr = reg.nr; +      } + +      if (insn->header.access_mode == BRW_ALIGN_1) { +	 if (reg.width == BRW_WIDTH_1 && +	     insn->header.execution_size == BRW_EXECUTE_1) { +	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0; +	    insn->bits3.da1.src1_width = BRW_WIDTH_1; +	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0; +	 } +	 else { +	    insn->bits3.da1.src1_horiz_stride = reg.hstride; +	    insn->bits3.da1.src1_width = reg.width; +	    insn->bits3.da1.src1_vert_stride = reg.vstride; +	 } +      } +      else { +	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); +	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); +	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); +	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + +	 /* This is an oddity of the fact we're using the same +	  * descriptions for registers in align_16 as align_1: +	  */ +	 if (reg.vstride == BRW_VERTICAL_STRIDE_8) +	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4; +	 else +	    insn->bits3.da16.src1_vert_stride = reg.vstride; +      } +   } +} + + + +static void brw_set_math_message( struct brw_instruction *insn, +				  unsigned msg_length, +				  unsigned response_length, +				  unsigned function, +				  unsigned integer_type, +				  boolean low_precision, +				  boolean saturate, +				  unsigned dataType ) +{ +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->bits3.math.function = function; +   insn->bits3.math.int_type = integer_type; +   insn->bits3.math.precision = low_precision; +   
insn->bits3.math.saturate = saturate; +   insn->bits3.math.data_type = dataType; +   insn->bits3.math.response_length = response_length; +   insn->bits3.math.msg_length = msg_length; +   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH; +   insn->bits3.math.end_of_thread = 0; +} + +static void brw_set_urb_message( struct brw_instruction *insn, +				 boolean allocate, +				 boolean used, +				 unsigned msg_length, +				 unsigned response_length, +				 boolean end_of_thread, +				 boolean complete, +				 unsigned offset, +				 unsigned swizzle_control ) +{ +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->bits3.urb.opcode = 0;	/* ? */ +   insn->bits3.urb.offset = offset; +   insn->bits3.urb.swizzle_control = swizzle_control; +   insn->bits3.urb.allocate = allocate; +   insn->bits3.urb.used = used;	/* ? */ +   insn->bits3.urb.complete = complete; +   insn->bits3.urb.response_length = response_length; +   insn->bits3.urb.msg_length = msg_length; +   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB; +   insn->bits3.urb.end_of_thread = end_of_thread; +} + +static void brw_set_dp_write_message( struct brw_instruction *insn, +				      unsigned binding_table_index, +				      unsigned msg_control, +				      unsigned msg_type, +				      unsigned msg_length, +				      unsigned pixel_scoreboard_clear, +				      unsigned response_length, +				      unsigned end_of_thread ) +{ +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->bits3.dp_write.binding_table_index = binding_table_index; +   insn->bits3.dp_write.msg_control = msg_control; +   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear; +   insn->bits3.dp_write.msg_type = msg_type; +   insn->bits3.dp_write.send_commit_msg = 0; +   insn->bits3.dp_write.response_length = response_length; +   insn->bits3.dp_write.msg_length = msg_length; +   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE; +   insn->bits3.urb.end_of_thread = end_of_thread; +} + +static void brw_set_dp_read_message( struct brw_instruction *insn, +				      unsigned binding_table_index, +				      unsigned msg_control, +				      unsigned msg_type, +				      unsigned target_cache, +				      unsigned msg_length, +				      unsigned response_length, +				      unsigned end_of_thread ) +{ +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->bits3.dp_read.binding_table_index = binding_table_index; +   insn->bits3.dp_read.msg_control = msg_control; +   insn->bits3.dp_read.msg_type = msg_type; +   insn->bits3.dp_read.target_cache = target_cache; +   insn->bits3.dp_read.response_length = response_length; +   insn->bits3.dp_read.msg_length = msg_length; +   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; +   insn->bits3.dp_read.end_of_thread = end_of_thread; +} + +static void brw_set_sampler_message( struct brw_instruction *insn, +				     unsigned binding_table_index, +				     unsigned sampler, +				     unsigned msg_type, +				     unsigned response_length, +				     unsigned msg_length, +				     boolean eot) +{ +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->bits3.sampler.binding_table_index = binding_table_index; +   insn->bits3.sampler.sampler = sampler; +   insn->bits3.sampler.msg_type = msg_type; +   insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; +   insn->bits3.sampler.response_length = response_length; +   insn->bits3.sampler.msg_length = msg_length; +   insn->bits3.sampler.end_of_thread = eot; +   insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER; +} + + + +static struct 
brw_instruction *next_insn( struct brw_compile *p, +					  unsigned opcode ) +{ +   struct brw_instruction *insn; + +   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN); + +   insn = &p->store[p->nr_insn++]; +   memcpy(insn, p->current, sizeof(*insn)); + +   /* Reset this one-shot flag: +    */ + +   if (p->current->header.destreg__conditonalmod) { +      p->current->header.destreg__conditonalmod = 0; +      p->current->header.predicate_control = BRW_PREDICATE_NORMAL; +   } + +   insn->header.opcode = opcode; +   return insn; +} + + +struct brw_instruction *brw_alu1( struct brw_compile *p, +				  unsigned opcode, +				  struct brw_reg dest, +				  struct brw_reg src ) +{ +   struct brw_instruction *insn = next_insn(p, opcode); +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src); +   return insn; +} + +struct brw_instruction *brw_alu2(struct brw_compile *p, +				 unsigned opcode, +				 struct brw_reg dest, +				 struct brw_reg src0, +				 struct brw_reg src1 ) +{ +   struct brw_instruction *insn = next_insn(p, opcode); +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src0); +   brw_set_src1(insn, src1); +   return insn; +} + + +/*********************************************************************** + * Convenience routines. + */ +#define ALU1(OP)					\ +struct brw_instruction *brw_##OP(struct brw_compile *p,			\ +	      struct brw_reg dest,			\ +	      struct brw_reg src0)   			\ +{							\ +   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\ +} + +#define ALU2(OP)					\ +struct brw_instruction *brw_##OP(struct brw_compile *p,			\ +	      struct brw_reg dest,			\ +	      struct brw_reg src0,			\ +	      struct brw_reg src1)   			\ +{							\ +   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\ +} + + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU2(RSR) +ALU2(RSL) +ALU2(ASR) +ALU2(ADD) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) + + + + +void brw_NOP(struct brw_compile *p) +{ +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP); +   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); +   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); +   brw_set_src1(insn, brw_imm_ud(0x0)); +} + + + + + +/*********************************************************************** + * Comparisons, if/else/endif + */ + +struct brw_instruction *brw_JMPI(struct brw_compile *p, +	      struct brw_reg dest, +	      struct brw_reg src0, +	      struct brw_reg src1) +{ +   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1); + +   p->current->header.predicate_control = BRW_PREDICATE_NONE; + +   return insn; +} + +/* EU takes the value from the flag register and pushes it onto some + * sort of a stack (presumably merging with any flag value already on + * the stack).  Within an if block, the flags at the top of the stack + * control execution on each channel of the unit, eg. on each of the + * 16 pixel values in our wm programs. + * + * When the matching 'else' instruction is reached (presumably by + * countdown of the instruction count patched in by our ELSE/ENDIF + * functions), the relevent flags are inverted. + * + * When the matching 'endif' instruction is reached, the flags are + * popped off.  If the stack is now empty, normal execution resumes. + * + * No attempt is made to deal with stack overflow (14 elements?). 
+ */ +struct brw_instruction *brw_IF(struct brw_compile *p, unsigned execute_size) +{ +   struct brw_instruction *insn; + +   if (p->single_program_flow) { +      assert(execute_size == BRW_EXECUTE_1); + +      insn = next_insn(p, BRW_OPCODE_ADD); +      insn->header.predicate_inverse = 1; +   } else { +      insn = next_insn(p, BRW_OPCODE_IF); +   } + +   /* Override the defaults for this instruction: +    */ +   brw_set_dest(insn, brw_ip_reg()); +   brw_set_src0(insn, brw_ip_reg()); +   brw_set_src1(insn, brw_imm_d(0x0)); + +   insn->header.execution_size = execute_size; +   insn->header.compression_control = BRW_COMPRESSION_NONE; +   insn->header.predicate_control = BRW_PREDICATE_NORMAL; +   insn->header.mask_control = BRW_MASK_ENABLE; + +   p->current->header.predicate_control = BRW_PREDICATE_NONE; + +   return insn; +} + + +struct brw_instruction *brw_ELSE(struct brw_compile *p, +				 struct brw_instruction *if_insn) +{ +   struct brw_instruction *insn; + +   if (p->single_program_flow) { +      insn = next_insn(p, BRW_OPCODE_ADD); +   } else { +      insn = next_insn(p, BRW_OPCODE_ELSE); +   } + +   brw_set_dest(insn, brw_ip_reg()); +   brw_set_src0(insn, brw_ip_reg()); +   brw_set_src1(insn, brw_imm_d(0x0)); + +   insn->header.compression_control = BRW_COMPRESSION_NONE; +   insn->header.execution_size = if_insn->header.execution_size; +   insn->header.mask_control = BRW_MASK_ENABLE; + +   /* Patch the if instruction to point at this instruction. +    */ +   if (p->single_program_flow) { +      assert(if_insn->header.opcode == BRW_OPCODE_ADD); + +      if_insn->bits3.ud = (insn - if_insn + 1) * 16; +   } else { +      assert(if_insn->header.opcode == BRW_OPCODE_IF); + +      if_insn->bits3.if_else.jump_count = insn - if_insn; +      if_insn->bits3.if_else.pop_count = 1; +      if_insn->bits3.if_else.pad0 = 0; +   } + +   return insn; +} + +void brw_ENDIF(struct brw_compile *p, +	       struct brw_instruction *patch_insn) +{ +   if (p->single_program_flow) { +      /* In single program flow mode, there's no need to execute an ENDIF, +       * since we don't need to do any stack operations, and if we're executing +       * currently, we want to just continue executing. +       */ +      struct brw_instruction *next = &p->store[p->nr_insn]; + +      assert(patch_insn->header.opcode == BRW_OPCODE_ADD); + +      patch_insn->bits3.ud = (next - patch_insn) * 16; +   } else { +      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF); + +      brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); +      brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); +      brw_set_src1(insn, brw_imm_d(0x0)); + +      insn->header.compression_control = BRW_COMPRESSION_NONE; +      insn->header.execution_size = patch_insn->header.execution_size; +      insn->header.mask_control = BRW_MASK_ENABLE; + +      assert(patch_insn->bits3.if_else.jump_count == 0); + +      /* Patch the if or else instructions to point at this or the next +       * instruction respectively. 
+       */ +      if (patch_insn->header.opcode == BRW_OPCODE_IF) { +	 /* Automagically turn it into an IFF: +	  */ +	 patch_insn->header.opcode = BRW_OPCODE_IFF; +	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1; +	 patch_insn->bits3.if_else.pop_count = 0; +	 patch_insn->bits3.if_else.pad0 = 0; +      } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) { +	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1; +	 patch_insn->bits3.if_else.pop_count = 1; +	 patch_insn->bits3.if_else.pad0 = 0; +      } else { +	 assert(0); +      } + +      /* Also pop item off the stack in the endif instruction: +       */ +      insn->bits3.if_else.jump_count = 0; +      insn->bits3.if_else.pop_count = 1; +      insn->bits3.if_else.pad0 = 0; +   } +} + +struct brw_instruction *brw_BREAK(struct brw_compile *p) +{ +   struct brw_instruction *insn; +   insn = next_insn(p, BRW_OPCODE_BREAK); +   brw_set_dest(insn, brw_ip_reg()); +   brw_set_src0(insn, brw_ip_reg()); +   brw_set_src1(insn, brw_imm_d(0x0)); +   insn->header.compression_control = BRW_COMPRESSION_NONE; +   insn->header.execution_size = BRW_EXECUTE_8; +   insn->header.mask_control = BRW_MASK_DISABLE; +   insn->bits3.if_else.pad0 = 0; +   return insn; +} + +struct brw_instruction *brw_CONT(struct brw_compile *p) +{ +   struct brw_instruction *insn; +   insn = next_insn(p, BRW_OPCODE_CONTINUE); +   brw_set_dest(insn, brw_ip_reg()); +   brw_set_src0(insn, brw_ip_reg()); +   brw_set_src1(insn, brw_imm_d(0x0)); +   insn->header.compression_control = BRW_COMPRESSION_NONE; +   insn->header.execution_size = BRW_EXECUTE_8; +   insn->header.mask_control = BRW_MASK_DISABLE; +   insn->bits3.if_else.pad0 = 0; +   return insn; +} + +/* DO/WHILE loop: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size) +{ +   if (p->single_program_flow) { +      return &p->store[p->nr_insn]; +   } else { +      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO); + +      /* Override the defaults for this instruction: +       */ +      brw_set_dest(insn, brw_null_reg()); +      brw_set_src0(insn, brw_null_reg()); +      brw_set_src1(insn, brw_null_reg()); + +      insn->header.compression_control = BRW_COMPRESSION_NONE; +      insn->header.execution_size = execute_size; +      insn->header.predicate_control = BRW_PREDICATE_NONE; +      /* insn->header.mask_control = BRW_MASK_ENABLE; */ +      insn->header.mask_control = BRW_MASK_DISABLE; + +      return insn; +   } +} + + + +struct brw_instruction *brw_WHILE(struct brw_compile *p, +	       struct brw_instruction *do_insn) +{ +   struct brw_instruction *insn; + +   if (p->single_program_flow) +      insn = next_insn(p, BRW_OPCODE_ADD); +   else +      insn = next_insn(p, BRW_OPCODE_WHILE); + +   brw_set_dest(insn, brw_ip_reg()); +   brw_set_src0(insn, brw_ip_reg()); +   brw_set_src1(insn, brw_imm_d(0x0)); + +   insn->header.compression_control = BRW_COMPRESSION_NONE; + +   if (p->single_program_flow) { +      insn->header.execution_size = BRW_EXECUTE_1; + +      insn->bits3.d = (do_insn - insn) * 16; +   } else { +      insn->header.execution_size = do_insn->header.execution_size; + +      assert(do_insn->header.opcode == BRW_OPCODE_DO); +      insn->bits3.if_else.jump_count = do_insn - insn; +      insn->bits3.if_else.pop_count = 0; +      insn->bits3.if_else.pad0 = 0; +   } + +/*    insn->header.mask_control = BRW_MASK_ENABLE; */ + +   insn->header.mask_control = BRW_MASK_DISABLE; +   p->current->header.predicate_control = BRW_PREDICATE_NONE; +   return insn; +} + 
+ +/* FORWARD JUMPS: + */ +void brw_land_fwd_jump(struct brw_compile *p, +		       struct brw_instruction *jmp_insn) +{ +   struct brw_instruction *landing = &p->store[p->nr_insn]; + +   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI); +   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE); + +   jmp_insn->bits3.ud = (landing - jmp_insn) - 1; +} + + + +/* To integrate with the above, it makes sense that the comparison + * instruction should populate the flag register.  It might be simpler + * just to use the flag reg for most WM tasks? + */ +void brw_CMP(struct brw_compile *p, +	     struct brw_reg dest, +	     unsigned conditional, +	     struct brw_reg src0, +	     struct brw_reg src1) +{ +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP); + +   insn->header.destreg__conditonalmod = conditional; +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src0); +   brw_set_src1(insn, src1); + +/*    guess_execution_size(insn, src0); */ + + +   /* Make it so that future instructions will use the computed flag +    * value until brw_set_predicate_control_flag_value() is called +    * again. +    */ +   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && +       dest.nr == 0) { +      p->current->header.predicate_control = BRW_PREDICATE_NORMAL; +      p->flag_value = 0xff; +   } +} + + + +/*********************************************************************** + * Helpers for the various SEND message types: + */ + +/* Invert 8 values + */ +void brw_math( struct brw_compile *p, +	       struct brw_reg dest, +	       unsigned function, +	       unsigned saturate, +	       unsigned msg_reg_nr, +	       struct brw_reg src, +	       unsigned data_type, +	       unsigned precision ) +{ +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); +   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; +   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; + +   /* Example code doesn't set predicate_control for send +    * instructions. +    */ +   insn->header.predicate_control = 0; +   insn->header.destreg__conditonalmod = msg_reg_nr; + +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src); +   brw_set_math_message(insn, +			msg_length, response_length, +			function, +			BRW_MATH_INTEGER_UNSIGNED, +			precision, +			saturate, +			data_type); +} + +/* Use 2 send instructions to invert 16 elements + */ +void brw_math_16( struct brw_compile *p, +		  struct brw_reg dest, +		  unsigned function, +		  unsigned saturate, +		  unsigned msg_reg_nr, +		  struct brw_reg src, +		  unsigned precision ) +{ +   struct brw_instruction *insn; +   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; +   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 
2 : 1; + +   /* First instruction: +    */ +   brw_push_insn_state(p); +   brw_set_predicate_control_flag_value(p, 0xff); +   brw_set_compression_control(p, BRW_COMPRESSION_NONE); + +   insn = next_insn(p, BRW_OPCODE_SEND); +   insn->header.destreg__conditonalmod = msg_reg_nr; + +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src); +   brw_set_math_message(insn, +			msg_length, response_length, +			function, +			BRW_MATH_INTEGER_UNSIGNED, +			precision, +			saturate, +			BRW_MATH_DATA_VECTOR); + +   /* Second instruction: +    */ +   insn = next_insn(p, BRW_OPCODE_SEND); +   insn->header.compression_control = BRW_COMPRESSION_2NDHALF; +   insn->header.destreg__conditonalmod = msg_reg_nr+1; + +   brw_set_dest(insn, offset(dest,1)); +   brw_set_src0(insn, src); +   brw_set_math_message(insn, +			msg_length, response_length, +			function, +			BRW_MATH_INTEGER_UNSIGNED, +			precision, +			saturate, +			BRW_MATH_DATA_VECTOR); + +   brw_pop_insn_state(p); +} + + + + +void brw_dp_WRITE_16( struct brw_compile *p, +		      struct brw_reg src, +		      unsigned msg_reg_nr, +		      unsigned scratch_offset ) +{ +   { +      brw_push_insn_state(p); +      brw_set_mask_control(p, BRW_MASK_DISABLE); +      brw_set_compression_control(p, BRW_COMPRESSION_NONE); + +      brw_MOV(p, +	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), +	      brw_imm_d(scratch_offset)); + +      brw_pop_insn_state(p); +   } + +   { +      unsigned msg_length = 3; +      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); +      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + +      insn->header.predicate_control = 0; /* XXX */ +      insn->header.compression_control = BRW_COMPRESSION_NONE; +      insn->header.destreg__conditonalmod = msg_reg_nr; + +      brw_set_dest(insn, dest); +      brw_set_src0(insn, src); + +      brw_set_dp_write_message(insn, +			       255, /* bti */ +			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */ +			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ +			       msg_length, +			       0, /* pixel scoreboard */ +			       0, /* response_length */ +			       0); /* eot */ +   } + +} + + +void brw_dp_READ_16( struct brw_compile *p, +		      struct brw_reg dest, +		      unsigned msg_reg_nr, +		      unsigned scratch_offset ) +{ +   { +      brw_push_insn_state(p); +      brw_set_compression_control(p, BRW_COMPRESSION_NONE); +      brw_set_mask_control(p, BRW_MASK_DISABLE); + +      brw_MOV(p, +	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), +	      brw_imm_d(scratch_offset)); + +      brw_pop_insn_state(p); +   } + +   { +      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + +      insn->header.predicate_control = 0; /* XXX */ +      insn->header.compression_control = BRW_COMPRESSION_NONE; +      insn->header.destreg__conditonalmod = msg_reg_nr; + +      brw_set_dest(insn, dest);	/* UW? 
*/ +      brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); + +      brw_set_dp_read_message(insn, +			      255, /* bti */ +			      3,  /* msg_control */ +			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ +			      1, /* target cache */ +			      1, /* msg_length */ +			      2, /* response_length */ +			      0); /* eot */ +   } +} + + +void brw_fb_WRITE(struct brw_compile *p, +		   struct brw_reg dest, +		   unsigned msg_reg_nr, +		   struct brw_reg src0, +		   unsigned binding_table_index, +		   unsigned msg_length, +		   unsigned response_length, +		   boolean eot) +{ +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + +   insn->header.predicate_control = 0; /* XXX */ +   insn->header.compression_control = BRW_COMPRESSION_NONE; +   insn->header.destreg__conditonalmod = msg_reg_nr; + +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src0); +   brw_set_dp_write_message(insn, +			    binding_table_index, +			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */ +			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */ +			    msg_length, +			    1,	/* pixel scoreboard */ +			    response_length, +			    eot); +} + + + +void brw_SAMPLE(struct brw_compile *p, +		struct brw_reg dest, +		unsigned msg_reg_nr, +		struct brw_reg src0, +		unsigned binding_table_index, +		unsigned sampler, +		unsigned writemask, +		unsigned msg_type, +		unsigned response_length, +		unsigned msg_length, +		boolean eot) +{ +   boolean need_stall = 0; + +   if(writemask == 0) { +/*       debug_printf("%s: zero writemask??\n", __FUNCTION__); */ +      return; +   } + +   /* Hardware doesn't do destination dependency checking on send +    * instructions properly.  Add a workaround which generates the +    * dependency by other means.  In practice it seems like this bug +    * only crops up for texture samples, and only where registers are +    * written by the send and then written again later without being +    * read in between.  Luckily for us, we already track that +    * information and use it to modify the writemask for the +    * instruction, so that is a guide for whether a workaround is +    * needed. 
+    */ +   if (writemask != TGSI_WRITEMASK_XYZW) { +      unsigned dst_offset = 0; +      unsigned i, newmask = 0, len = 0; + +      for (i = 0; i < 4; i++) { +	 if (writemask & (1<<i)) +	    break; +	 dst_offset += 2; +      } +      for (; i < 4; i++) { +	 if (!(writemask & (1<<i))) +	    break; +	 newmask |= 1<<i; +	 len++; +      } + +      if (newmask != writemask) { +	 need_stall = 1; +/* 	 debug_printf("need stall %x %x\n", newmask , writemask); */ +      } +      else { +	 struct brw_reg m1 = brw_message_reg(msg_reg_nr); + +	 newmask = ~newmask & TGSI_WRITEMASK_XYZW; + +	 brw_push_insn_state(p); + +	 brw_set_compression_control(p, BRW_COMPRESSION_NONE); +	 brw_set_mask_control(p, BRW_MASK_DISABLE); + +	 brw_MOV(p, m1, brw_vec8_grf(0,0)); +  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); + +	 brw_pop_insn_state(p); + +  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); +	 dest = offset(dest, dst_offset); +	 response_length = len * 2; +      } +   } + +   { +      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + +      insn->header.predicate_control = 0; /* XXX */ +      insn->header.compression_control = BRW_COMPRESSION_NONE; +      insn->header.destreg__conditonalmod = msg_reg_nr; + +      brw_set_dest(insn, dest); +      brw_set_src0(insn, src0); +      brw_set_sampler_message(insn, +			      binding_table_index, +			      sampler, +			      msg_type, +			      response_length, +			      msg_length, +			      eot); +   } + +   if (need_stall) +   { +      struct brw_reg reg = vec8(offset(dest, response_length-1)); + +      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 } +       */ +      brw_push_insn_state(p); +      brw_set_compression_control(p, FALSE); +      brw_MOV(p, reg, reg); +      brw_pop_insn_state(p); +   } + +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style.  Or perhaps + * just having the caller instantiate the fields in dword3 itself. + */ +void brw_urb_WRITE(struct brw_compile *p, +		   struct brw_reg dest, +		   unsigned msg_reg_nr, +		   struct brw_reg src0, +		   boolean allocate, +		   boolean used, +		   unsigned msg_length, +		   unsigned response_length, +		   boolean eot, +		   boolean writes_complete, +		   unsigned offset, +		   unsigned swizzle) +{ +   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + +   assert(msg_length < 16); + +   brw_set_dest(insn, dest); +   brw_set_src0(insn, src0); +   brw_set_src1(insn, brw_imm_d(0)); + +   insn->header.destreg__conditonalmod = msg_reg_nr; + +   brw_set_urb_message(insn, +		       allocate, +		       used, +		       msg_length, +		       response_length, +		       eot, +		       writes_complete, +		       offset, +		       swizzle); +} + diff --git a/src/gallium/drivers/i965simple/brw_eu_util.c b/src/gallium/drivers/i965simple/brw_eu_util.c new file mode 100644 index 0000000000..3a65b141f0 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_util.c @@ -0,0 +1,126 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +       + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_compile *p,  +			     struct brw_reg dst, +			     struct brw_reg src) +{ +   brw_math( p,  +	     dst, +	     BRW_MATH_FUNCTION_INV,  +	     BRW_MATH_SATURATE_NONE, +	     0, +	     src, +	     BRW_MATH_PRECISION_FULL,  +	     BRW_MATH_DATA_VECTOR ); +} + + + +void brw_copy4(struct brw_compile *p, +	       struct brw_reg dst, +	       struct brw_reg src, +	       unsigned count) +{ +   unsigned i; + +   dst = vec4(dst); +   src = vec4(src); + +   for (i = 0; i < count; i++) +   { +      unsigned delta = i*32; +      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta)); +      brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); +   } +} + + +void brw_copy8(struct brw_compile *p, +	       struct brw_reg dst, +	       struct brw_reg src, +	       unsigned count) +{ +   unsigned i; + +   dst = vec8(dst); +   src = vec8(src); + +   for (i = 0; i < count; i++) +   { +      unsigned delta = i*32; +      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta)); +   } +} + + +void brw_copy_indirect_to_indirect(struct brw_compile *p, +				   struct brw_indirect dst_ptr, +				   struct brw_indirect src_ptr, +				   unsigned count) +{ +   unsigned i; + +   for (i = 0; i < count; i++) +   { +      unsigned delta = i*32; +      brw_MOV(p, deref_4f(dst_ptr, delta),    deref_4f(src_ptr, delta)); +      brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); +   } +} + + +void brw_copy_from_indirect(struct brw_compile *p, +			    struct brw_reg dst, +			    struct brw_indirect ptr, +			    unsigned count) +{ +   unsigned i; + +   dst = vec4(dst); + +   for (i = 0; i < count; i++) +   { +      unsigned delta = i*32; +      brw_MOV(p, byte_offset(dst, delta),    deref_4f(ptr, delta)); +      brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); +   } +} + + + + diff --git a/src/gallium/drivers/i965simple/brw_flush.c b/src/gallium/drivers/i965simple/brw_flush.c new file mode 100644 index 0000000000..e6001c30d9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_flush.c @@ -0,0 +1,73 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten 
Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_batch.h" + + +static void brw_flush( struct pipe_context *pipe, +                       unsigned flags, +                       struct pipe_fence_handle **fence ) +{ +   struct brw_context *brw = brw_context(pipe); + +   /* Do we need to emit an MI_FLUSH command to flush the hardware +    * caches? +    */ +   if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) { +      struct brw_mi_flush flush; + +      memset(&flush, 0, sizeof(flush));       +      flush.opcode = CMD_MI_FLUSH; + +      if (!(flags & PIPE_FLUSH_RENDER_CACHE)) +	 flush.flags |= BRW_INHIBIT_FLUSH_RENDER_CACHE; + +      if (flags & PIPE_FLUSH_TEXTURE_CACHE) +	 flush.flags |= BRW_FLUSH_READ_CACHE; + +      BRW_BATCH_STRUCT(brw, &flush); +   } + +   /* If there are no flags, just flush pending commands to hardware: +    */ +   FLUSH_BATCH( fence ); +} + + + +void brw_init_flush_functions( struct brw_context *brw ) +{ +   brw->pipe.flush = brw_flush; +} diff --git a/src/gallium/drivers/i965simple/brw_gs.c b/src/gallium/drivers/i965simple/brw_gs.c new file mode 100644 index 0000000000..de60868ccc --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs.c @@ -0,0 +1,196 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_gs.h" + + + +static void compile_gs_prog( struct brw_context *brw, +			     struct brw_gs_prog_key *key ) +{ +   struct brw_gs_compile c; +   const unsigned *program; +   unsigned program_size; + +   memset(&c, 0, sizeof(c)); + +   c.key = *key; + +   /* Need to locate the two positions present in vertex + header. +    * These are currently hardcoded: +    */ +   c.nr_attrs = brw_count_bits(c.key.attrs); +   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */ +   c.nr_bytes = c.nr_regs * REG_SIZE; + + +   /* Begin the compilation: +    */ +   brw_init_compile(&c.func); + +   c.func.single_program_flow = 1; + +   /* For some reason the thread is spawned with only 4 channels +    * unmasked. +    */ +   brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + +   /* Note that primitives which don't require a GS program have +    * already been weeded out by this stage: +    */ +   switch (key->primitive) { +   case PIPE_PRIM_QUADS: +      brw_gs_quads( &c ); +      break; +   case PIPE_PRIM_QUAD_STRIP: +      brw_gs_quad_strip( &c ); +      break; +   case PIPE_PRIM_LINE_LOOP: +      brw_gs_lines( &c ); +      break; +   case PIPE_PRIM_LINES: +      if (key->hint_gs_always) +	 brw_gs_lines( &c ); +      else { +	 return; +      } +      break; +   case PIPE_PRIM_TRIANGLES: +      if (key->hint_gs_always) +	 brw_gs_tris( &c ); +      else { +	 return; +      } +      break; +   case PIPE_PRIM_POINTS: +      if (key->hint_gs_always) +	 brw_gs_points( &c ); +      else { +	 return; +      } +      break; +   default: +      return; +   } + +   /* get the program +    */ +   program = brw_get_program(&c.func, &program_size); + +   /* Upload +    */ +   brw->gs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_GS_PROG], +					      &c.key, +					      sizeof(c.key), +					      program, +					      program_size, +					      &c.prog_data, +					      &brw->gs.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, +			       struct brw_gs_prog_key *key ) +{ +   return brw_search_cache(&brw->cache[BRW_GS_PROG], +			   key, sizeof(*key), +			   &brw->gs.prog_data, +			   &brw->gs.prog_gs_offset); +} + + +static const int gs_prim[PIPE_PRIM_POLYGON+1] = { +   PIPE_PRIM_POINTS, +   PIPE_PRIM_LINES, +   PIPE_PRIM_LINE_LOOP, +   PIPE_PRIM_LINES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_QUADS, +   PIPE_PRIM_QUAD_STRIP, +   PIPE_PRIM_TRIANGLES +}; + +static void populate_key( struct brw_context *brw, +			  struct brw_gs_prog_key *key ) +{ +   memset(key, 0, sizeof(*key)); + +   /* CACHE_NEW_VS_PROG */ +   key->attrs = brw->vs.prog_data->outputs_written; + +   /* BRW_NEW_PRIMITIVE */ +   key->primitive = gs_prim[brw->primitive]; + +   
key->hint_gs_always = 0;	/* debug code? */ + +   key->need_gs_prog = (key->hint_gs_always || +			brw->primitive == PIPE_PRIM_QUADS || +			brw->primitive == PIPE_PRIM_QUAD_STRIP || +			brw->primitive == PIPE_PRIM_LINE_LOOP); +} + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_gs_prog( struct brw_context *brw ) +{ +   struct brw_gs_prog_key key; + +   /* Populate the key: +    */ +   populate_key(brw, &key); + +   if (brw->gs.prog_active != key.need_gs_prog) { +      brw->state.dirty.cache |= CACHE_NEW_GS_PROG; +      brw->gs.prog_active = key.need_gs_prog; +   } + +   if (brw->gs.prog_active) { +      if (!search_cache(brw, &key)) +	 compile_gs_prog( brw, &key ); +   } +} + + +const struct brw_tracked_state brw_gs_prog = { +   .dirty = { +      .brw   = BRW_NEW_PRIMITIVE, +      .cache = CACHE_NEW_VS_PROG +   }, +   .update = upload_gs_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_gs.h b/src/gallium/drivers/i965simple/brw_gs.h new file mode 100644 index 0000000000..f09141c6aa --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs.h @@ -0,0 +1,75 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +  + +#ifndef BRW_GS_H +#define BRW_GS_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_GS_VERTS (4)	      + +struct brw_gs_prog_key { +   unsigned attrs:32; +   unsigned primitive:4; +   unsigned hint_gs_always:1; +   unsigned need_gs_prog:1; +   unsigned pad:26; +}; + +struct brw_gs_compile { +   struct brw_compile func; +   struct brw_gs_prog_key key; +   struct brw_gs_prog_data prog_data; +    +   struct { +      struct brw_reg R0; +      struct brw_reg vertex[MAX_GS_VERTS]; +   } reg; + +   /* 3 different ways of expressing vertex size: +    */ +   unsigned nr_attrs; +   unsigned nr_regs; +   unsigned nr_bytes; +}; + +#define ATTR_SIZE  (4*4) + +void brw_gs_quads( struct brw_gs_compile *c ); +void brw_gs_quad_strip( struct brw_gs_compile *c ); +void brw_gs_tris( struct brw_gs_compile *c ); +void brw_gs_lines( struct brw_gs_compile *c ); +void brw_gs_points( struct brw_gs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_gs_emit.c b/src/gallium/drivers/i965simple/brw_gs_emit.c new file mode 100644 index 0000000000..c3cc90b10f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs_emit.c @@ -0,0 +1,148 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_gs.h" + +static void brw_gs_alloc_regs( struct brw_gs_compile *c, +			       unsigned nr_verts ) +{ +   unsigned i = 0,j; + +   /* Register usage is static, precompute here: +    */ +   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + +   /* Payload vertices plus space for more generated vertices: +    */ +   for (j = 0; j < nr_verts; j++) { +      c->reg.vertex[j] = brw_vec4_grf(i, 0); +      i += c->nr_regs; +   } + +   c->prog_data.urb_read_length = c->nr_regs;  +   c->prog_data.total_grf = i; +} + + +static void brw_gs_emit_vue(struct brw_gs_compile *c,  +			    struct brw_reg vert, +			    boolean last, +			    unsigned header) +{ +   struct brw_compile *p = &c->func; +   boolean allocate = !last; + +   /* Overwrite PrimType and PrimStart in the message header, for +    * each vertex in turn: +    */ +   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + +   /* Copy the vertex from vertn into m1..mN+1: +    */ +   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs); + +   /* Send each vertex as a seperate write to the urb.  This is +    * different to the concept in brw_sf_emit.c, where subsequent +    * writes are used to build up a single urb entry.  Each of these +    * writes instantiates a seperate urb entry, and a new one must be +    * allocated each time. +    */ +   brw_urb_WRITE(p,  +		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), +		 0, +		 c->reg.R0, +		 allocate, +		 1,		/* used */ +		 c->nr_regs + 1, /* msg length */ +		 allocate ? 1 : 0, /* response length */ +		 allocate ? 0 : 1, /* eot */ +		 1,		/* writes_complete */ +		 0,		/* urb offset */ +		 BRW_URB_SWIZZLE_NONE); +} + + + +void brw_gs_quads( struct brw_gs_compile *c ) +{ +   brw_gs_alloc_regs(c, 4); +    +   /* Use polygons for correct edgeflag behaviour. 
Note that vertex 3 +    * is the PV for quads, but vertex 0 for polygons: +    */ +   brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); +   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); +   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));  +   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_quad_strip( struct brw_gs_compile *c ) +{ +   brw_gs_alloc_regs(c, 4); +    +   brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); +   brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2)); +   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));  +   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_tris( struct brw_gs_compile *c ) +{ +   brw_gs_alloc_regs(c, 3); +   brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START)); +   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2)); +   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END)); +} + +void brw_gs_lines( struct brw_gs_compile *c ) +{ +   brw_gs_alloc_regs(c, 2); +   brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START)); +   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END)); +} + +void brw_gs_points( struct brw_gs_compile *c ) +{ +   brw_gs_alloc_regs(c, 1); +   brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END)); +} + + + + + + + + diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c new file mode 100644 index 0000000000..5b8016b2e9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs_state.c @@ -0,0 +1,90 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
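/* Illustrative sketch (not driver code): the vertex-header encoding used by
 * the emitters above.  Each re-emitted vertex carries the output primitive
 * type shifted left by two, OR'd with start/end flags.  The constants below
 * are placeholders standing in for the real brw_defines.h values.
 */
#include <stdio.h>

#define PRIM_TYPE_POLYGON 3u        /* placeholder for _3DPRIM_POLYGON */
#define PRIM_START        (1u << 0) /* placeholder for R02_PRIM_START  */
#define PRIM_END          (1u << 1) /* placeholder for R02_PRIM_END    */

static unsigned vert_header(unsigned prim_type, int first, int last)
{
   unsigned h = prim_type << 2;
   if (first) h |= PRIM_START;
   if (last)  h |= PRIM_END;
   return h;
}

int main(void)
{
   /* Quad case above: emit vertices in the order 3, 0, 1, 2 as a polygon. */
   static const int order[4] = { 3, 0, 1, 2 };
   for (int i = 0; i < 4; i++)
      printf("vert %d: header 0x%x\n", order[i],
             vert_header(PRIM_TYPE_POLYGON, i == 0, i == 3));
   return 0;
}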
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + + +static void upload_gs_unit( struct brw_context *brw ) +{ +   struct brw_gs_unit_state gs; + +   memset(&gs, 0, sizeof(gs)); + +   /* CACHE_NEW_GS_PROG */ +   if (brw->gs.prog_active) { +      gs.thread0.grf_reg_count = +	 align(brw->gs.prog_data->total_grf, 16) / 16 - 1; +      gs.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6; +      gs.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length; +   } +   else { +      gs.thread0.grf_reg_count = 0; +      gs.thread0.kernel_start_pointer = 0; +      gs.thread3.urb_entry_read_length = 1; +   } + +   /* BRW_NEW_URB_FENCE */ +   gs.thread4.nr_urb_entries = brw->urb.nr_gs_entries; +   gs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; + +   gs.thread4.max_threads = 0; /* Hardware requirement */ + +   if (BRW_DEBUG & DEBUG_STATS) +      gs.thread4.stats_enable = 1; + +   /* CONSTANT */ +   gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; +   gs.thread1.single_program_flow = 1; +   gs.thread3.dispatch_grf_start_reg = 1; +   gs.thread3.const_urb_entry_read_offset = 0; +   gs.thread3.const_urb_entry_read_length = 0; +   gs.thread3.urb_entry_read_offset = 0; + + +   brw->gs.state_gs_offset = brw_cache_data( &brw->cache[BRW_GS_UNIT], &gs ); +} + + +const struct brw_tracked_state brw_gs_unit = { +   .dirty = { +      .brw   = (BRW_NEW_CURBE_OFFSETS | +		BRW_NEW_URB_FENCE), +      .cache = CACHE_NEW_GS_PROG +   }, +   .update = upload_gs_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_misc_state.c b/src/gallium/drivers/i965simple/brw_misc_state.c new file mode 100644 index 0000000000..99ff4403a5 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_misc_state.c @@ -0,0 +1,488 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
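/* Illustrative sketch (not driver code): the two field encodings used by
 * upload_gs_unit() above.  The GRF count is stored in units of 16 registers
 * minus one, and the kernel start pointer is a 64-byte-aligned offset with
 * the low six bits dropped.
 */
#include <assert.h>

static unsigned align_up(unsigned value, unsigned alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

static unsigned grf_reg_count_field(unsigned total_grf)
{
   return align_up(total_grf, 16) / 16 - 1;
}

static unsigned kernel_start_pointer_field(unsigned prog_offset)
{
   assert((prog_offset & 63) == 0);  /* program must start on a 64-byte boundary */
   return prog_offset >> 6;
}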
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_batch.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + + + + + +/*********************************************************************** + * Blend color + */ + +static void upload_blend_constant_color(struct brw_context *brw) +{ +   struct brw_blend_constant_color bcc; + +   memset(&bcc, 0, sizeof(bcc)); +   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR; +   bcc.header.length = sizeof(bcc)/4-2; +   bcc.blend_constant_color[0] = brw->attribs.BlendColor.color[0]; +   bcc.blend_constant_color[1] = brw->attribs.BlendColor.color[1]; +   bcc.blend_constant_color[2] = brw->attribs.BlendColor.color[2]; +   bcc.blend_constant_color[3] = brw->attribs.BlendColor.color[3]; + +   BRW_CACHED_BATCH_STRUCT(brw, &bcc); +} + + +const struct brw_tracked_state brw_blend_constant_color = { +   .dirty = { +      .brw = BRW_NEW_BLEND, +      .cache = 0 +   }, +   .update = upload_blend_constant_color +}; + + +/*********************************************************************** + * Drawing rectangle  + */ +static void upload_drawing_rect(struct brw_context *brw) +{ +   struct brw_drawrect bdr; + +   memset(&bdr, 0, sizeof(bdr)); +   bdr.header.opcode = CMD_DRAW_RECT; +   bdr.header.length = sizeof(bdr)/4 - 2; +   bdr.xmin = 0; +   bdr.ymin = 0; +   bdr.xmax = brw->attribs.FrameBuffer.cbufs[0]->width; +   bdr.ymax = brw->attribs.FrameBuffer.cbufs[0]->height; +   bdr.xorg = 0; +   bdr.yorg = 0; + +   /* Can't use BRW_CACHED_BATCH_STRUCT because this is also emitted +    * uncached in brw_draw.c: +    */ +   BRW_BATCH_STRUCT(brw, &bdr); +} + +const struct brw_tracked_state brw_drawing_rect = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_drawing_rect +}; + +/** + * Upload the binding table pointers, which point each stage's array of surface + * state pointers. + * + * The binding table pointers are relative to the surface state base address, + * which is the BRW_SS_POOL cache buffer. + */ +static void upload_binding_table_pointers(struct brw_context *brw) +{ +   struct brw_binding_table_pointers btp; +   memset(&btp, 0, sizeof(btp)); + +   btp.header.opcode = CMD_BINDING_TABLE_PTRS; +   btp.header.length = sizeof(btp)/4 - 2; +   btp.vs = 0; +   btp.gs = 0; +   btp.clp = 0; +   btp.sf = 0; +   btp.wm = brw->wm.bind_ss_offset; + +   BRW_CACHED_BATCH_STRUCT(brw, &btp); +} + +const struct brw_tracked_state brw_binding_table_pointers = { +   .dirty = { +      .brw = 0, +      .cache = CACHE_NEW_SURF_BIND +   }, +   .update = upload_binding_table_pointers, +}; + + +/** + * Upload pointers to the per-stage state. + * + * The state pointers in this packet are all relative to the general state + * base address set by CMD_STATE_BASE_ADDRESS, which is the BRW_GS_POOL buffer. + */ +static void upload_pipelined_state_pointers(struct brw_context *brw ) +{ +   struct brw_pipelined_state_pointers psp; +   memset(&psp, 0, sizeof(psp)); + +   psp.header.opcode = CMD_PIPELINED_STATE_POINTERS; +   psp.header.length = sizeof(psp)/4 - 2; + +   psp.vs.offset = brw->vs.state_gs_offset >> 5; +   psp.sf.offset = brw->sf.state_gs_offset >> 5; +   psp.wm.offset = brw->wm.state_gs_offset >> 5; +   psp.cc.offset = brw->cc.state_gs_offset >> 5; + +   /* GS gets turned on and off regularly.  Need to re-emit URB fence +    * after this occurs. 
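/* Illustrative sketch (not driver code): the header convention shared by the
 * state packets in this file.  The hardware length field is the total packet
 * size in DWORDs minus two, which is where the recurring
 * sizeof(pkt)/4 - 2 expression comes from.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t cmd_length_field(size_t packet_size_in_bytes)
{
   return (uint32_t)(packet_size_in_bytes / 4 - 2);
}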
+    */ +   if (brw->gs.prog_active) { +      psp.gs.offset = brw->gs.state_gs_offset >> 5; +      psp.gs.enable = 1; +   } + +   if (0) { +      psp.clp.offset = brw->clip.state_gs_offset >> 5; +      psp.clp.enable = 1; +   } + + +   if (BRW_CACHED_BATCH_STRUCT(brw, &psp)) +      brw->state.dirty.brw |= BRW_NEW_PSP; +} + +const struct brw_tracked_state brw_pipelined_state_pointers = { +   .dirty = { +      .brw = 0, +      .cache = (CACHE_NEW_VS_UNIT | +		CACHE_NEW_GS_UNIT | +		CACHE_NEW_GS_PROG | +		CACHE_NEW_CLIP_UNIT | +		CACHE_NEW_SF_UNIT | +		CACHE_NEW_WM_UNIT | +		CACHE_NEW_CC_UNIT) +   }, +   .update = upload_pipelined_state_pointers +}; + +static void upload_psp_urb_cbs(struct brw_context *brw ) +{ +   upload_pipelined_state_pointers(brw); +   brw_upload_urb_fence(brw); +   brw_upload_constant_buffer_state(brw); +} + + +const struct brw_tracked_state brw_psp_urb_cbs = { +   .dirty = { +      .brw = BRW_NEW_URB_FENCE, +      .cache = (CACHE_NEW_VS_UNIT | +		CACHE_NEW_GS_UNIT | +		CACHE_NEW_GS_PROG | +		CACHE_NEW_CLIP_UNIT | +		CACHE_NEW_SF_UNIT | +		CACHE_NEW_WM_UNIT | +		CACHE_NEW_CC_UNIT) +   }, +   .update = upload_psp_urb_cbs +}; + +/** + * Upload the depthbuffer offset and format. + * + * We have to do this per state validation as we need to emit the relocation + * in the batch buffer. + */ +static void upload_depthbuffer(struct brw_context *brw) +{ +   struct pipe_surface *depth_surface = brw->attribs.FrameBuffer.zsbuf; + +   BEGIN_BATCH(5, INTEL_BATCH_NO_CLIPRECTS); +   OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (5 - 2)); +   if (depth_surface == NULL) { +      OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | +		(BRW_SURFACE_NULL << 29)); +      OUT_BATCH(0); +      OUT_BATCH(0); +      OUT_BATCH(0); +   } else { +      unsigned int format; +      struct brw_texture *tex = (struct brw_texture *)depth_surface->texture; +      assert(depth_surface->block.width == 1); +      assert(depth_surface->block.height == 1); +      switch (depth_surface->block.size) { +      case 2: +	 format = BRW_DEPTHFORMAT_D16_UNORM; +	 break; +      case 4: +	 if (depth_surface->format == PIPE_FORMAT_Z32_FLOAT) +	    format = BRW_DEPTHFORMAT_D32_FLOAT; +	 else +	    format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT; +	 break; +      default: +	 assert(0); +	 return; +      } + +      OUT_BATCH((depth_surface->stride - 1) | +		(format << 18) | +		(BRW_TILEWALK_YMAJOR << 26) | +//		(depth_surface->region->tiled << 27) | +		(BRW_SURFACE_2D << 29)); +      OUT_RELOC(tex->buffer, +		PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE, 0); +      OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | +		((depth_surface->stride/depth_surface->block.size - 1) << 6) | +		((depth_surface->height - 1) << 19)); +      OUT_BATCH(0); +   } +   ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_depthbuffer = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_depthbuffer, +}; + + + + +/*********************************************************************** + * Polygon stipple packet + */ + +static void upload_polygon_stipple(struct brw_context *brw) +{ +   struct brw_polygon_stipple bps; +   unsigned i; + +   memset(&bps, 0, sizeof(bps)); +   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN; +   bps.header.length = sizeof(bps)/4-2; + +   /* XXX: state tracker should send *all* state down initially! 
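/* Illustrative sketch (not driver code): the depth-format choice made in
 * upload_depthbuffer() above, written as a standalone helper.  The enum
 * values are local placeholders standing in for the BRW_DEPTHFORMAT_*
 * constants in brw_defines.h.
 */
enum depth_fmt {
   DEPTH_D16_UNORM,          /* 2 bytes per pixel */
   DEPTH_D32_FLOAT,          /* 4 bytes, pure float Z */
   DEPTH_D24_UNORM_S8_UINT,  /* 4 bytes, packed depth + stencil */
   DEPTH_UNSUPPORTED
};

static enum depth_fmt pick_depth_format(unsigned block_size, int is_z32_float)
{
   switch (block_size) {
   case 2:
      return DEPTH_D16_UNORM;
   case 4:
      return is_z32_float ? DEPTH_D32_FLOAT : DEPTH_D24_UNORM_S8_UINT;
   default:
      return DEPTH_UNSUPPORTED;   /* the driver asserts here */
   }
}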
+    */ +   if (brw->attribs.PolygonStipple) +      for (i = 0; i < 32; i++) +	 bps.stipple[i] = brw->attribs.PolygonStipple->stipple[31 - i]; /* invert */ + +   BRW_CACHED_BATCH_STRUCT(brw, &bps); +} + +const struct brw_tracked_state brw_polygon_stipple = { +   .dirty = { +      .brw = BRW_NEW_STIPPLE, +      .cache = 0 +   }, +   .update = upload_polygon_stipple +}; + + +/*********************************************************************** + * Line stipple packet + */ + +static void upload_line_stipple(struct brw_context *brw) +{ +   struct brw_line_stipple bls; +   float tmp; +   int tmpi; + +   memset(&bls, 0, sizeof(bls)); +   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN; +   bls.header.length = sizeof(bls)/4 - 2; + +   bls.bits0.pattern = brw->attribs.Raster->line_stipple_pattern; +   bls.bits1.repeat_count = brw->attribs.Raster->line_stipple_factor; + +   tmp = 1.0 / (float) brw->attribs.Raster->line_stipple_factor; +   tmpi = tmp * (1<<13); + + +   bls.bits1.inverse_repeat_count = tmpi; + +   BRW_CACHED_BATCH_STRUCT(brw, &bls); +} + +const struct brw_tracked_state brw_line_stipple = { +   .dirty = { +      .brw = BRW_NEW_STIPPLE, +      .cache = 0 +   }, +   .update = upload_line_stipple +}; + + +/*********************************************************************** + * Misc constant state packets + */ + +static void upload_pipe_control(struct brw_context *brw) +{ +   struct brw_pipe_control pc; + +   return; + +   memset(&pc, 0, sizeof(pc)); + +   pc.header.opcode = CMD_PIPE_CONTROL; +   pc.header.length = sizeof(pc)/4 - 2; +   pc.header.post_sync_operation = PIPE_CONTROL_NOWRITE; + +   pc.header.instruction_state_cache_flush_enable = 1; + +   pc.bits1.dest_addr_type = PIPE_CONTROL_GTTWRITE_GLOBAL; + +   BRW_BATCH_STRUCT(brw, &pc); +} + +const struct brw_tracked_state brw_pipe_control = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_pipe_control +}; + + +/*********************************************************************** + * Misc invarient state packets + */ + +static void upload_invarient_state( struct brw_context *brw ) +{ +   { +      struct brw_mi_flush flush; + +      memset(&flush, 0, sizeof(flush));       +      flush.opcode = CMD_MI_FLUSH; +      flush.flags = BRW_FLUSH_STATE_CACHE | BRW_FLUSH_READ_CACHE; +      BRW_BATCH_STRUCT(brw, &flush); +   } + +   { +      /* 0x61040000  Pipeline Select */ +      /*     PipelineSelect            : 0 */ +      struct brw_pipeline_select ps; + +      memset(&ps, 0, sizeof(ps)); +      ps.header.opcode = CMD_PIPELINE_SELECT; +      ps.header.pipeline_select = 0; +      BRW_BATCH_STRUCT(brw, &ps); +   } + +   { +      struct brw_global_depth_offset_clamp gdo; +      memset(&gdo, 0, sizeof(gdo)); + +      /* Disable depth offset clamping. 
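/* Illustrative sketch (not driver code): the conversion upload_line_stipple()
 * performs for bits1.inverse_repeat_count above -- 1/factor expressed with
 * 13 fractional bits, matching the tmp * (1 << 13) computation.
 */
#include <stdio.h>

static unsigned inverse_repeat_count_fixed(unsigned factor)
{
   float inv = 1.0f / (float) factor;
   return (unsigned) (inv * (1 << 13));
}

int main(void)
{
   printf("factor 1 -> 0x%04x\n", inverse_repeat_count_fixed(1));  /* 0x2000 */
   printf("factor 2 -> 0x%04x\n", inverse_repeat_count_fixed(2));  /* 0x1000 */
   printf("factor 3 -> 0x%04x\n", inverse_repeat_count_fixed(3));  /* 0x0aaa */
   return 0;
}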
+       */ +      gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP; +      gdo.header.length = sizeof(gdo)/4 - 2; +      gdo.depth_offset_clamp = 0.0; + +      BRW_BATCH_STRUCT(brw, &gdo); +   } + + +   /* 0x61020000  State Instruction Pointer */ +   { +      struct brw_system_instruction_pointer sip; +      memset(&sip, 0, sizeof(sip)); + +      sip.header.opcode = CMD_STATE_INSN_POINTER; +      sip.header.length = 0; +      sip.bits0.pad = 0; +      sip.bits0.system_instruction_pointer = 0; +      BRW_BATCH_STRUCT(brw, &sip); +   } + + +   { +      struct brw_vf_statistics vfs; +      memset(&vfs, 0, sizeof(vfs)); + +      vfs.opcode = CMD_VF_STATISTICS; +      if (BRW_DEBUG & DEBUG_STATS) +	 vfs.statistics_enable = 1; + +      BRW_BATCH_STRUCT(brw, &vfs); +   } + +    +   { +      struct brw_polygon_stipple_offset bpso; +       +      memset(&bpso, 0, sizeof(bpso)); +      bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET; +      bpso.header.length = sizeof(bpso)/4-2;       +      bpso.bits0.x_offset = 0; +      bpso.bits0.y_offset = 0; + +      BRW_BATCH_STRUCT(brw, &bpso); +   } +} + +const struct brw_tracked_state brw_invarient_state = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_invarient_state +}; + +/** + * Define the base addresses which some state is referenced from. + * + * This allows us to avoid having to emit relocations in many places for + * cached state, and instead emit pointers inside of large, mostly-static + * state pools.  This comes at the expense of memory, and more expensive cache + * misses. + */ +static void upload_state_base_address( struct brw_context *brw ) +{ +   /* Output the structure (brw_state_base_address) directly to the +    * batchbuffer, so we can emit relocations inline. +    */ +   BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS); +   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); +   OUT_RELOC(brw->pool[BRW_GS_POOL].buffer, +	     PIPE_BUFFER_USAGE_GPU_READ, +	     1); /* General state base address */ +   OUT_RELOC(brw->pool[BRW_SS_POOL].buffer, +	     PIPE_BUFFER_USAGE_GPU_READ, +	     1); /* Surface state base address */ +   OUT_BATCH(1); /* Indirect object base address */ +   OUT_BATCH(1); /* General state upper bound */ +   OUT_BATCH(1); /* Indirect object upper bound */ +   ADVANCE_BATCH(); +} + + +const struct brw_tracked_state brw_state_base_address = { +   .dirty = { +      .brw = BRW_NEW_SCENE, +      .cache = 0 +   }, +   .update = upload_state_base_address +}; diff --git a/src/gallium/drivers/i965simple/brw_reg.h b/src/gallium/drivers/i965simple/brw_reg.h new file mode 100644 index 0000000000..9e885c3b3b --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_reg.h @@ -0,0 +1,76 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#define CMD_MI				(0x0 << 29) +#define CMD_2D				(0x2 << 29) +#define CMD_3D				(0x3 << 29) + +#define MI_BATCH_BUFFER_END		(CMD_MI | 0xA << 23) + +/* Stalls command execution waiting for the given events to have occurred. */ +#define MI_WAIT_FOR_EVENT               (CMD_MI | (0x3 << 23)) +#define MI_WAIT_FOR_PLANE_B_FLIP        (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP        (1<<2) + +/* Primitive dispatch on 830-945 */ +#define _3DPRIMITIVE			(CMD_3D | (0x1f << 24)) +#define PRIM_INDIRECT            (1<<23) +#define PRIM_INLINE              (0<<23) +#define PRIM_INDIRECT_SEQUENTIAL (0<<17) +#define PRIM_INDIRECT_ELTS       (1<<17) + +#define PRIM3D_TRILIST		(0x0<<18) +#define PRIM3D_TRISTRIP 	(0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18) +#define PRIM3D_TRIFAN		(0x3<<18) +#define PRIM3D_POLY		(0x4<<18) +#define PRIM3D_LINELIST 	(0x5<<18) +#define PRIM3D_LINESTRIP	(0x6<<18) +#define PRIM3D_RECTLIST 	(0x7<<18) +#define PRIM3D_POINTLIST	(0x8<<18) +#define PRIM3D_DIB		(0x9<<18) +#define PRIM3D_MASK		(0x1f<<18) + +#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6) + +#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4) + +#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6) + +/* BR00 */ +#define XY_BLT_WRITE_ALPHA	(1 << 21) +#define XY_BLT_WRITE_RGB	(1 << 20) +#define XY_SRC_TILED		(1 << 15) +#define XY_DST_TILED		(1 << 11) + +/* BR13 */ +#define BR13_565		(0x1 << 24) +#define BR13_8888		(0x3 << 24) + +#define FENCE_LINEAR 0 +#define FENCE_XMAJOR 1 +#define FENCE_YMAJOR 2 diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c new file mode 100644 index 0000000000..b22e105f10 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_screen.c @@ -0,0 +1,246 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_string.h" +#include "util/u_simple_screen.h" + +#include "brw_context.h" +#include "brw_screen.h" +#include "brw_tex_layout.h" + + +static const char * +brw_get_vendor( struct pipe_screen *screen ) +{ +   return "Tungsten Graphics, Inc."; +} + + +static const char * +brw_get_name( struct pipe_screen *screen ) +{ +   static char buffer[128]; +   const char *chipset; + +   switch (brw_screen(screen)->pci_id) { +   case PCI_CHIP_I965_Q: +      chipset = "Intel(R) 965Q"; +      break; +   case PCI_CHIP_I965_G: +   case PCI_CHIP_I965_G_1: +      chipset = "Intel(R) 965G"; +      break; +   case PCI_CHIP_I965_GM: +      chipset = "Intel(R) 965GM"; +      break; +   case PCI_CHIP_I965_GME: +      chipset = "Intel(R) 965GME/GLE"; +      break; +   default: +      chipset = "unknown"; +      break; +   } + +   util_snprintf(buffer, sizeof(buffer), "i965 (chipset: %s)", chipset); +   return buffer; +} + + +static int +brw_get_param(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +      return 8; +   case PIPE_CAP_NPOT_TEXTURES: +      return 1; +   case PIPE_CAP_TWO_SIDED_STENCIL: +      return 1; +   case PIPE_CAP_GLSL: +      return 0; +   case PIPE_CAP_S3TC: +      return 0; +   case PIPE_CAP_ANISOTROPIC_FILTER: +      return 0; +   case PIPE_CAP_POINT_SPRITE: +      return 0; +   case PIPE_CAP_MAX_RENDER_TARGETS: +      return 1; +   case PIPE_CAP_OCCLUSION_QUERY: +      return 0; +   case PIPE_CAP_TEXTURE_SHADOW_MAP: +      return 1; +   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +      return 11; /* max 1024x1024 */ +   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +      return 8;  /* max 128x128x128 */ +   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +      return 11; /* max 1024x1024 */ +   default: +      return 0; +   } +} + + +static float +brw_get_paramf(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_LINE_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_LINE_WIDTH_AA: +      return 7.5; + +   case PIPE_CAP_MAX_POINT_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_POINT_WIDTH_AA: +      return 255.0; + +   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +      return 4.0; + +   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +      return 16.0; + +   default: +      return 0; +   } +} + + +static boolean +brw_is_format_supported( struct pipe_screen *screen, +                         enum pipe_format format,  +                         enum pipe_texture_target target, +                         unsigned tex_usage,  +                         unsigned geom_flags ) +{ +#if 0 +   /* XXX: This is broken -- rewrite if still needed. 
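/* Illustrative sketch (not driver code): how the PIPE_CAP_MAX_TEXTURE_*_LEVELS
 * values returned above map to a maximum dimension -- 11 mipmap levels allow
 * 2^(11-1) = 1024 texels per side, 8 levels allow 128.
 */
#include <stdio.h>

static unsigned max_size_for_levels(unsigned levels)
{
   return 1u << (levels - 1);
}

int main(void)
{
   printf("11 levels -> %u\n", max_size_for_levels(11));  /* 1024 */
   printf(" 8 levels -> %u\n", max_size_for_levels(8));   /* 128  */
   return 0;
}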
*/ +   static const unsigned tex_supported[] = { +      PIPE_FORMAT_R8G8B8A8_UNORM, +      PIPE_FORMAT_A8R8G8B8_UNORM, +      PIPE_FORMAT_R5G6B5_UNORM, +      PIPE_FORMAT_L8_UNORM, +      PIPE_FORMAT_A8_UNORM, +      PIPE_FORMAT_I8_UNORM, +      PIPE_FORMAT_L8A8_UNORM, +      PIPE_FORMAT_YCBCR, +      PIPE_FORMAT_YCBCR_REV, +      PIPE_FORMAT_S8_Z24, +   }; + + +   /* Actually a lot more than this - add later: +    */ +   static const unsigned render_supported[] = { +      PIPE_FORMAT_A8R8G8B8_UNORM, +      PIPE_FORMAT_R5G6B5_UNORM, +   }; + +   /* +    */ +   static const unsigned z_stencil_supported[] = { +      PIPE_FORMAT_Z16_UNORM, +      PIPE_FORMAT_Z32_UNORM, +      PIPE_FORMAT_S8Z24_UNORM, +   }; + +   switch (type) { +   case PIPE_RENDER_FORMAT: +      *numFormats = Elements(render_supported); +      return render_supported; + +   case PIPE_TEX_FORMAT: +      *numFormats = Elements(tex_supported); +      return render_supported; + +   case PIPE_Z_STENCIL_FORMAT: +      *numFormats = Elements(render_supported); +      return render_supported; + +   default: +      *numFormats = 0; +      return NULL; +   } +#else +   switch (format) { +   case PIPE_FORMAT_A8R8G8B8_UNORM: +   case PIPE_FORMAT_R5G6B5_UNORM: +   case PIPE_FORMAT_S8Z24_UNORM: +      return TRUE; +   default: +      return FALSE; +   }; +   return FALSE; +#endif +} + + +static void +brw_destroy_screen( struct pipe_screen *screen ) +{ +   struct pipe_winsys *winsys = screen->winsys; + +   if(winsys->destroy) +      winsys->destroy(winsys); + +   FREE(screen); +} + + +/** + * Create a new brw_screen object + */ +struct pipe_screen * +brw_create_screen(struct pipe_winsys *winsys, uint pci_id) +{ +   struct brw_screen *brwscreen = CALLOC_STRUCT(brw_screen); + +   if (!brwscreen) +      return NULL; + +   brwscreen->pci_id = pci_id; + +   brwscreen->screen.winsys = winsys; + +   brwscreen->screen.destroy = brw_destroy_screen; + +   brwscreen->screen.get_name = brw_get_name; +   brwscreen->screen.get_vendor = brw_get_vendor; +   brwscreen->screen.get_param = brw_get_param; +   brwscreen->screen.get_paramf = brw_get_paramf; +   brwscreen->screen.is_format_supported = brw_is_format_supported; + +   brw_init_screen_texture_funcs(&brwscreen->screen); +   u_simple_screen_init(&brwscreen->screen); + +   return &brwscreen->screen; +} diff --git a/src/gallium/drivers/i965simple/brw_screen.h b/src/gallium/drivers/i965simple/brw_screen.h new file mode 100644 index 0000000000..d3c70387e6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_screen.h @@ -0,0 +1,68 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
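/* Hypothetical usage sketch (not in the tree): creating and querying the
 * screen built by brw_create_screen() above.  It assumes the driver and
 * Gallium headers are in scope; the winsys object would come from the
 * window-system layer, and error handling is omitted.
 */
static void example_query_screen(struct pipe_winsys *winsys)
{
   struct pipe_screen *screen = brw_create_screen(winsys, PCI_CHIP_I965_G);

   debug_printf("renderer: %s\n", screen->get_name(screen));
   debug_printf("max render targets: %d\n",
                screen->get_param(screen, PIPE_CAP_MAX_RENDER_TARGETS));   /* 1 */
   debug_printf("ARGB8888 supported: %d\n",
                screen->is_format_supported(screen, PIPE_FORMAT_A8R8G8B8_UNORM,
                                            PIPE_TEXTURE_2D, 0, 0));       /* TRUE */
   screen->destroy(screen);
}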
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef BRW_SCREEN_H +#define BRW_SCREEN_H + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Subclass of pipe_screen + */ +struct brw_screen +{ +   struct pipe_screen screen; + +   uint pci_id; +}; + + +/** cast wrapper */ +static INLINE struct brw_screen * +brw_screen(struct pipe_screen *pscreen) +{ +   return (struct brw_screen *) pscreen; +} + + +extern struct pipe_screen * +brw_create_screen(struct pipe_winsys *winsys, uint pci_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* BRW_SCREEN_H */ diff --git a/src/gallium/drivers/i965simple/brw_sf.c b/src/gallium/drivers/i965simple/brw_sf.c new file mode 100644 index 0000000000..b82a2e143b --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf.c @@ -0,0 +1,351 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" +#include "brw_state.h" +#include "tgsi/tgsi_parse.h" + + +static void compile_sf_prog( struct brw_context *brw, +			     struct brw_sf_prog_key *key ) +{ +   struct brw_sf_compile c; +   const unsigned *program; +   unsigned program_size; + +   memset(&c, 0, sizeof(c)); + +   /* Begin the compilation: +    */ +   brw_init_compile(&c.func); + +   c.key = *key; + + +   c.nr_attrs = c.key.vp_output_count; +   c.nr_attr_regs = (c.nr_attrs+1)/2; + +   c.nr_setup_attrs = c.key.fp_input_count + 1; /* +1 for position */ +   c.nr_setup_regs = (c.nr_setup_attrs+1)/2; + +   c.prog_data.urb_read_length = c.nr_attr_regs; +   c.prog_data.urb_entry_size = c.nr_setup_regs * 2; + + +   /* Which primitive?  Or all three? +    */ +   switch (key->primitive) { +   case SF_TRIANGLES: +      c.nr_verts = 3; +      brw_emit_tri_setup( &c ); +      break; +   case SF_LINES: +      c.nr_verts = 2; +      brw_emit_line_setup( &c ); +      break; +   case SF_POINTS: +      c.nr_verts = 1; +      brw_emit_point_setup( &c ); +      break; + +   case SF_UNFILLED_TRIS: +   default: +      assert(0); +      return; +   } + + + +   /* get the program +    */ +   program = brw_get_program(&c.func, &program_size); + +   /* Upload +    */ +   brw->sf.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_SF_PROG], +					      &c.key, +					      sizeof(c.key), +					      program, +					      program_size, +					      &c.prog_data, +					      &brw->sf.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, +			       struct brw_sf_prog_key *key ) +{ +   return brw_search_cache(&brw->cache[BRW_SF_PROG], +			   key, sizeof(*key), +			   &brw->sf.prog_data, +			   &brw->sf.prog_gs_offset); +} + + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_sf_prog( struct brw_context *brw ) +{ +   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram; +   struct brw_sf_prog_key key; +   struct tgsi_parse_context parse; +   int i, done = 0; + + +   memset(&key, 0, sizeof(key)); + +   /* Populate the key, noting state dependencies: +    */ +   /* CACHE_NEW_VS_PROG */ +   key.vp_output_count = brw->vs.prog_data->outputs_written; + +   /* BRW_NEW_FS */ +   key.fp_input_count = brw->attribs.FragmentProgram->info.file_max[TGSI_FILE_INPUT] + 1; + + +   /* BRW_NEW_REDUCED_PRIMITIVE */ +   switch (brw->reduced_primitive) { +   case PIPE_PRIM_TRIANGLES: +//      if (key.attrs & (1<<VERT_RESULT_EDGE)) +//	 key.primitive = SF_UNFILLED_TRIS; +//      else +      key.primitive = SF_TRIANGLES; +      break; +   case PIPE_PRIM_LINES: +      key.primitive = SF_LINES; +      break; +   case PIPE_PRIM_POINTS: +      key.primitive = SF_POINTS; +      break; +   } + + + +   /* Scan fp inputs to figure out what interpolation modes are +    * required for each incoming vp output.  There is an assumption +    * that the state tracker makes sure there is a 1:1 linkage between +    * these sets of attributes (XXX: position??) 
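/* Illustrative sketch (not driver code): the register-count arithmetic used
 * by compile_sf_prog() above.  Two float[4] attributes share one 256-bit GRF,
 * so counts are rounded up to whole registers; the * 2 mirrors
 * c.prog_data.urb_entry_size = c.nr_setup_regs * 2 in the code.
 */
static unsigned attr_regs(unsigned nr_attrs)
{
   return (nr_attrs + 1) / 2;             /* two vec4 attributes per register */
}

static unsigned sf_urb_entry_size(unsigned nr_setup_attrs)
{
   return attr_regs(nr_setup_attrs) * 2;  /* as in compile_sf_prog() above */
}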
+    */ +   tgsi_parse_init( &parse, fs->program.tokens ); +   while( !done && +	  !tgsi_parse_end_of_tokens( &parse ) )  +   { +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT)  +	 { +	    int first = parse.FullToken.FullDeclaration.DeclarationRange.First; +	    int last = parse.FullToken.FullDeclaration.DeclarationRange.Last; +	    int interp_mode = parse.FullToken.FullDeclaration.Declaration.Interpolate; +	    //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName; +	    //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex; + +	    debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode); +	     +	    switch (interp_mode) { +	    case TGSI_INTERPOLATE_CONSTANT: +	       for (i = first; i <= last; i++)  +		  key.const_mask |= (1 << i); +	       break; +	    case TGSI_INTERPOLATE_LINEAR: +	       for (i = first; i <= last; i++)  +		  key.linear_mask |= (1 << i); +	       break; +	    case TGSI_INTERPOLATE_PERSPECTIVE: +	       for (i = first; i <= last; i++)  +		  key.persp_mask |= (1 << i); +	       break; +	    default: +	       break; +	    } + +	    /* Also need stuff for flat shading, twosided color. +	     */ + +	 } +	 break; +      default: +	 done = 1; +	 break; +      } +   } + +   /* Hack: Adjust for position.  Optimize away when not required (ie +    * for perspective interpolation). +    */ +   key.persp_mask <<= 1; +   key.linear_mask <<= 1;  +   key.linear_mask |= 1; +   key.const_mask <<= 1; + +   debug_printf("key.persp_mask: %x\n", key.persp_mask); +   debug_printf("key.linear_mask: %x\n", key.linear_mask); +   debug_printf("key.const_mask: %x\n", key.const_mask); + + +//   key.do_point_sprite = brw->attribs.Point->PointSprite; +//   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin; + +//   key.do_flat_shading = (brw->attribs.Raster->flatshade); +//   key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide); + +//   if (key.do_twoside_color) +//      key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW); + + +   if (!search_cache(brw, &key)) +      compile_sf_prog( brw, &key ); +} + + +const struct brw_tracked_state brw_sf_prog = { +   .dirty = { +      .brw   = (BRW_NEW_RASTERIZER | +		BRW_NEW_REDUCED_PRIMITIVE | +		BRW_NEW_VS | +		BRW_NEW_FS), +      .cache = 0, +   }, +   .update = upload_sf_prog +}; + + + +#if 0 +/* Build a struct like the one we'd like the state tracker to pass to + * us. 
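/* Illustrative sketch (not driver code): the mask adjustment done near the
 * end of upload_sf_prog() above.  Bit i marks setup attribute i; shifting
 * every mask left by one and forcing bit 0 of linear_mask reserves slot 0
 * for position, which is always linearly interpolated.
 */
#include <stdint.h>

struct interp_masks {
   uint32_t persp_mask;
   uint32_t linear_mask;
   uint32_t const_mask;
};

static void reserve_position_slot(struct interp_masks *m)
{
   m->persp_mask  <<= 1;
   m->linear_mask <<= 1;
   m->linear_mask  |= 1;    /* slot 0: position, linear */
   m->const_mask  <<= 1;
}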
+ */ +static void update_sf_linkage( struct brw_context *brw ) +{ +   const struct brw_vertex_program *vs = brw->attribs.VertexProgram; +   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram; +   struct pipe_setup_linkage state; +   struct tgsi_parse_context parse; + +   int i, j; +   int nr_vp_outputs = 0; +   int done = 0; + +   struct {  +      unsigned semantic:8; +      unsigned semantic_index:16; +   } fp_semantic[32], vp_semantic[32]; + +   memset(&state, 0, sizeof(state)); + +   state.fp_input_count = 0; + + + +    + + +   assert(state.fp_input_count == fs->program.num_inputs); + +       +   /* Then scan vp outputs +    */ +   done = 0; +   tgsi_parse_init( &parse, vs->program.tokens ); +   while( !done && +	  !tgsi_parse_end_of_tokens( &parse ) )  +   { +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT)  +	 { +	    int first = parse.FullToken.FullDeclaration.DeclarationRange.First; +	    int last = parse.FullToken.FullDeclaration.DeclarationRange.Last; + +	    for (i = first; i < last; i++) { +	       vp_semantic[i].semantic =  +		  parse.FullToken.FullDeclaration.Semantic.SemanticName; +	       vp_semantic[i].semantic_index =  +		  parse.FullToken.FullDeclaration.Semantic.SemanticIndex; +	    } +	     +	    assert(last > nr_vp_outputs); +	    nr_vp_outputs = last; +	 } +	 break; +      default: +	 done = 1; +	 break; +      } +   } + + +   /* Now match based on semantic information. +    */ +   for (i = 0; i< state.fp_input_count; i++) { +      for (j = 0; j < nr_vp_outputs; j++) { +	 if (fp_semantic[i].semantic == vp_semantic[j].semantic && +	     fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) { +	    state.fp_input[i].vp_output = j; +	 } +      } +      if (fp_semantic[i].semantic == TGSI_SEMANTIC_COLOR) { +	 for (j = 0; j < nr_vp_outputs; j++) { +	    if (TGSI_SEMANTIC_BCOLOR == vp_semantic[j].semantic && +		fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) { +	       state.fp_input[i].bf_vp_output = j; +	    } +	 } +      } +   } + +   if (memcmp(&brw->sf.linkage, &state, sizeof(state)) != 0) { +      brw->sf.linkage = state; +      brw->state.dirty.brw |= BRW_NEW_SF_LINKAGE; +   } +} + + +const struct brw_tracked_state brw_sf_linkage = { +   .dirty = { +      .brw   = (BRW_NEW_VS | +		BRW_NEW_FS), +      .cache = 0, +   }, +   .update = update_sf_linkage +}; + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_sf.h b/src/gallium/drivers/i965simple/brw_sf.h new file mode 100644 index 0000000000..b7ada47560 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf.h @@ -0,0 +1,122 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRW_SF_H +#define BRW_SF_H + +#include "brw_context.h" +#include "brw_eu.h" + + +#define SF_POINTS    0 +#define SF_LINES     1 +#define SF_TRIANGLES 2 +#define SF_UNFILLED_TRIS   3 + + + +struct brw_sf_prog_key { +   unsigned vp_output_count:5; +   unsigned fp_input_count:5; + +   unsigned primitive:2; +   unsigned do_twoside_color:1; +   unsigned do_flat_shading:1; +   unsigned frontface_ccw:1; +   unsigned do_point_sprite:1; + +   /* Interpolation masks; +    */ +   unsigned linear_mask; +   unsigned persp_mask; +   unsigned const_mask; + + +//   int SpriteOrigin; +}; + +struct brw_sf_point_tex { +	boolean CoordReplace; +}; + +struct brw_sf_compile { +   struct brw_compile func; +   struct brw_sf_prog_key key; +   struct brw_sf_prog_data prog_data; + +   struct brw_reg pv; +   struct brw_reg det; +   struct brw_reg dx0; +   struct brw_reg dx2; +   struct brw_reg dy0; +   struct brw_reg dy2; + +   /* z and 1/w passed in seperately: +    */ +   struct brw_reg z[3]; +   struct brw_reg inv_w[3]; + +   /* The vertices: +    */ +   struct brw_reg vert[3]; + +    /* Temporaries, allocated after last vertex reg. +    */ +   struct brw_reg inv_det; +   struct brw_reg a1_sub_a0; +   struct brw_reg a2_sub_a0; +   struct brw_reg tmp; + +   struct brw_reg m1Cx; +   struct brw_reg m2Cy; +   struct brw_reg m3C0; + +   unsigned nr_verts; +   unsigned nr_attrs; +   unsigned nr_attr_regs; +   unsigned nr_setup_attrs; +   unsigned nr_setup_regs; +#if 0 +   ubyte attr_to_idx[VERT_RESULT_MAX]; +   ubyte idx_to_attr[VERT_RESULT_MAX]; +   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX]; +#endif +}; + + +void brw_emit_tri_setup( struct brw_sf_compile *c ); +void brw_emit_line_setup( struct brw_sf_compile *c ); +void brw_emit_point_setup( struct brw_sf_compile *c ); +void brw_emit_point_sprite_setup( struct brw_sf_compile *c ); +void brw_emit_anyprim_setup( struct brw_sf_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_sf_emit.c b/src/gallium/drivers/i965simple/brw_sf_emit.c new file mode 100644 index 0000000000..78d6fa5e9e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf_emit.c @@ -0,0 +1,382 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" + + + +/*********************************************************************** + * Triangle setup. + */ + + +static void alloc_regs( struct brw_sf_compile *c ) +{ +   unsigned reg, i; + +   /* Values computed by fixed function unit: +    */ +   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD); +   c->det = brw_vec1_grf(1, 2); +   c->dx0 = brw_vec1_grf(1, 3); +   c->dx2 = brw_vec1_grf(1, 4); +   c->dy0 = brw_vec1_grf(1, 5); +   c->dy2 = brw_vec1_grf(1, 6); + +   /* z and 1/w passed in seperately: +    */ +   c->z[0]     = brw_vec1_grf(2, 0); +   c->inv_w[0] = brw_vec1_grf(2, 1); +   c->z[1]     = brw_vec1_grf(2, 2); +   c->inv_w[1] = brw_vec1_grf(2, 3); +   c->z[2]     = brw_vec1_grf(2, 4); +   c->inv_w[2] = brw_vec1_grf(2, 5); + +   /* The vertices: +    */ +   reg = 3; +   for (i = 0; i < c->nr_verts; i++) { +      c->vert[i] = brw_vec8_grf(reg, 0); +      reg += c->nr_attr_regs; +   } + +   /* Temporaries, allocated after last vertex reg. 
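/* Illustrative sketch (not driver code): the GRF layout established by
 * alloc_regs() above -- r1 holds the fixed-function scalars (PV, det,
 * dx0/dx2/dy0/dy2), r2 holds z and 1/w for each vertex, vertices start at
 * r3 with nr_attr_regs registers each, and the temporaries follow the last
 * vertex.
 */
static unsigned first_temp_reg(unsigned nr_verts, unsigned nr_attr_regs)
{
   return 3 + nr_verts * nr_attr_regs;
}
/* e.g. a triangle with 4 attribute registers per vertex places c->inv_det
 * at r15, matching the "reg" counter in alloc_regs(). */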
+    */ +   c->inv_det = brw_vec1_grf(reg, 0);  reg++; +   c->a1_sub_a0 = brw_vec8_grf(reg, 0);  reg++; +   c->a2_sub_a0 = brw_vec8_grf(reg, 0);  reg++; +   c->tmp = brw_vec8_grf(reg, 0);  reg++; + +   /* Note grf allocation: +    */ +   c->prog_data.total_grf = reg; + + +   /* Outputs of this program - interpolation coefficients for +    * rasterization: +    */ +   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0); +   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0); +   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0); +} + + +static void copy_z_inv_w( struct brw_sf_compile *c ) +{ +   struct brw_compile *p = &c->func; +   unsigned i; + +   brw_push_insn_state(p); + +   /* Copy both scalars with a single MOV: +    */ +   for (i = 0; i < c->nr_verts; i++) +      brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i])); + +   brw_pop_insn_state(p); +} + + +static void invert_det( struct brw_sf_compile *c) +{ +   brw_math(&c->func, +	    c->inv_det, +	    BRW_MATH_FUNCTION_INV, +	    BRW_MATH_SATURATE_NONE, +	    0, +	    c->det, +	    BRW_MATH_DATA_SCALAR, +	    BRW_MATH_PRECISION_FULL); + +} + +#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \ +                               FRAG_BIT_COL0 | \ +			       FRAG_BIT_COL1) + +static boolean calculate_masks( struct brw_sf_compile *c, +				  unsigned reg, +				  ushort *pc, +				  ushort *pc_persp, +				  ushort *pc_linear) +{ +   boolean is_last_attr = (reg == c->nr_setup_regs - 1); +   unsigned persp_mask = c->key.persp_mask; +   unsigned linear_mask = c->key.linear_mask; + +   debug_printf("persp_mask: %x\n", persp_mask); +   debug_printf("linear_mask: %x\n", linear_mask); + +   *pc_persp = 0; +   *pc_linear = 0; +   *pc = 0xf; + +   if (persp_mask & (1 << (reg*2))) +      *pc_persp = 0xf; + +   if (linear_mask & (1 << (reg*2))) +      *pc_linear = 0xf; + +   /* Maybe only processs one attribute on the final round: +    */ +   if (reg*2+1 < c->nr_setup_attrs) { +      *pc |= 0xf0; + +      if (persp_mask & (1 << (reg*2+1))) +	 *pc_persp |= 0xf0; + +      if (linear_mask & (1 << (reg*2+1))) +	 *pc_linear |= 0xf0; +   } + +   debug_printf("pc: %x\n", *pc); +   debug_printf("pc_persp: %x\n", *pc_persp); +   debug_printf("pc_linear: %x\n", *pc_linear); +    + +   return is_last_attr; +} + + + +void brw_emit_tri_setup( struct brw_sf_compile *c ) +{ +   struct brw_compile *p = &c->func; +   unsigned i; + +   debug_printf("%s START ==============\n", __FUNCTION__); + +   c->nr_verts = 3; +   alloc_regs(c); +   invert_det(c); +   copy_z_inv_w(c); + + +   for (i = 0; i < c->nr_setup_regs; i++) +   { +      /* Pair of incoming attributes: +       */ +      struct brw_reg a0 = offset(c->vert[0], i); +      struct brw_reg a1 = offset(c->vert[1], i); +      struct brw_reg a2 = offset(c->vert[2], i); +      ushort pc = 0, pc_persp = 0, pc_linear = 0; +      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + +      if (pc_persp) +      { +	 brw_set_predicate_control_flag_value(p, pc_persp); +	 brw_MUL(p, a0, a0, c->inv_w[0]); +	 brw_MUL(p, a1, a1, c->inv_w[1]); +	 brw_MUL(p, a2, a2, c->inv_w[2]); +      } + + +      /* Calculate coefficients for interpolated values: +       */ +      if (pc_linear) +      { +	 brw_set_predicate_control_flag_value(p, pc_linear); + +	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); +	 brw_ADD(p, c->a2_sub_a0, a2, negate(a0)); + +	 /* calculate dA/dx +	  */ +	 brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2); +	 brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0)); +	 brw_MUL(p, c->m1Cx, c->tmp, 
c->inv_det); + +	 /* calculate dA/dy +	  */ +	 brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0); +	 brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2)); +	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); +      } + +      { +	 brw_set_predicate_control_flag_value(p, pc); +	 /* start point for interpolation +	  */ +	 brw_MOV(p, c->m3C0, a0); + +	 /* Copy m0..m3 to URB.  m0 is implicitly copied from r0 in +	  * the send instruction: +	  */ +	 brw_urb_WRITE(p, +		       brw_null_reg(), +		       0, +		       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */ +		       0, 	/* allocate */ +		       1,	/* used */ +		       4, 	/* msg len */ +		       0,	/* response len */ +		       last,	/* eot */ +		       last, 	/* writes complete */ +		       i*4,	/* offset */ +		       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */ +      } +   } + +   debug_printf("%s DONE ==============\n", __FUNCTION__); + +} + + + +void brw_emit_line_setup( struct brw_sf_compile *c ) +{ +   struct brw_compile *p = &c->func; +   unsigned i; + + +   c->nr_verts = 2; +   alloc_regs(c); +   invert_det(c); +   copy_z_inv_w(c); + +   for (i = 0; i < c->nr_setup_regs; i++) +   { +      /* Pair of incoming attributes: +       */ +      struct brw_reg a0 = offset(c->vert[0], i); +      struct brw_reg a1 = offset(c->vert[1], i); +      ushort pc, pc_persp, pc_linear; +      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + +      if (pc_persp) +      { +	 brw_set_predicate_control_flag_value(p, pc_persp); +	 brw_MUL(p, a0, a0, c->inv_w[0]); +	 brw_MUL(p, a1, a1, c->inv_w[1]); +      } + +      /* Calculate coefficients for position, color: +       */ +      if (pc_linear) { +	 brw_set_predicate_control_flag_value(p, pc_linear); + +	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + + 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); +	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + +	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0); +	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); +      } + +      { +	 brw_set_predicate_control_flag_value(p, pc); + +	 /* start point for interpolation +	  */ +	 brw_MOV(p, c->m3C0, a0); + +	 /* Copy m0..m3 to URB. +	  */ +	 brw_urb_WRITE(p, +		       brw_null_reg(), +		       0, +		       brw_vec8_grf(0, 0), +		       0, 	/* allocate */ +		       1, 	/* used */ +		       4, 	/* msg len */ +		       0,	/* response len */ +		       last, 	/* eot */ +		       last, 	/* writes complete */ +		       i*4,	/* urb destination offset */ +		       BRW_URB_SWIZZLE_TRANSPOSE); +      } +   } +} + + +/* Points setup - several simplifications as all attributes are + * constant across the face of the point (point sprites excluded!) + */ +void brw_emit_point_setup( struct brw_sf_compile *c ) +{ +   struct brw_compile *p = &c->func; +   unsigned i; + +   c->nr_verts = 1; +   alloc_regs(c); +   copy_z_inv_w(c); + +   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */ +   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */ + +   for (i = 0; i < c->nr_setup_regs; i++) +   { +      struct brw_reg a0 = offset(c->vert[0], i); +      ushort pc, pc_persp, pc_linear; +      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + +      if (pc_persp) +      { +	 /* This seems odd as the values are all constant, but the +	  * fragment shader will be expecting it: +	  */ +	 brw_set_predicate_control_flag_value(p, pc_persp); +	 brw_MUL(p, a0, a0, c->inv_w[0]); +      } + + +      /* The delta values are always zero, just send the starting +       * coordinate.  
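/* Illustrative sketch (not driver code): the per-attribute plane setup the
 * triangle path above performs with vector instructions, written out for a
 * single float channel.  dx0/dy0/dx2/dy2 and det come from the
 * fixed-function payload; C0 is the attribute value at vertex 0.
 */
struct attr_plane {
   float Cx;   /* dA/dx */
   float Cy;   /* dA/dy */
   float C0;   /* value at the starting vertex */
};

static struct attr_plane setup_tri_plane(float a0, float a1, float a2,
                                         float dx0, float dy0,
                                         float dx2, float dy2,
                                         float det)
{
   struct attr_plane p;
   float inv_det = 1.0f / det;
   float d10 = a1 - a0;
   float d20 = a2 - a0;

   p.Cx = (d10 * dy2 - d20 * dy0) * inv_det;
   p.Cy = (d20 * dx0 - d10 * dx2) * inv_det;
   p.C0 = a0;
   return p;
}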
Again, this is to fit in with the interpolation +       * code in the fragment shader. +       */ +      { +	 brw_set_predicate_control_flag_value(p, pc); + +	 brw_MOV(p, c->m3C0, a0); /* constant value */ + +	 /* Copy m0..m3 to URB. +	  */ +	 brw_urb_WRITE(p, +		       brw_null_reg(), +		       0, +		       brw_vec8_grf(0, 0), +		       0, 	/* allocate */ +		       1,	/* used */ +		       4, 	/* msg len */ +		       0,	/* response len */ +		       last, 	/* eot */ +		       last, 	/* writes complete */ +		       i*4,	/* urb destination offset */ +		       BRW_URB_SWIZZLE_TRANSPOSE); +      } +   } +} diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c new file mode 100644 index 0000000000..2a5de61c21 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf_state.c @@ -0,0 +1,181 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +static void upload_sf_vp(struct brw_context *brw) +{ +   struct brw_sf_viewport sfv; + +   memset(&sfv, 0, sizeof(sfv)); + + +   /* BRW_NEW_VIEWPORT */ +   { +      const float *scale = brw->attribs.Viewport.scale; +      const float *trans = brw->attribs.Viewport.translate; + +      sfv.viewport.m00 = scale[0]; +      sfv.viewport.m11 = scale[1]; +      sfv.viewport.m22 = scale[2];  +      sfv.viewport.m30 = trans[0]; +      sfv.viewport.m31 = trans[1]; +      sfv.viewport.m32 = trans[2]; +   } + +   /* _NEW_SCISSOR */ +   sfv.scissor.xmin = brw->attribs.Scissor.minx; +   sfv.scissor.xmax = brw->attribs.Scissor.maxx - 1; +   sfv.scissor.ymin = brw->attribs.Scissor.miny; +   sfv.scissor.ymax = brw->attribs.Scissor.maxy - 1; + +   brw->sf.vp_gs_offset = brw_cache_data( &brw->cache[BRW_SF_VP], &sfv ); +} + +const struct brw_tracked_state brw_sf_vp = { +   .dirty = { +      .brw   = (BRW_NEW_SCISSOR | +		BRW_NEW_VIEWPORT), +      .cache = 0 +   }, +   .update = upload_sf_vp +}; + +static void upload_sf_unit( struct brw_context *brw ) +{ +   struct brw_sf_unit_state sf; +   memset(&sf, 0, sizeof(sf)); + +   /* CACHE_NEW_SF_PROG */ +   sf.thread0.grf_reg_count = align(brw->sf.prog_data->total_grf, 16) / 16 - 1; +   sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6; +   sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length; + +   sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; +   sf.thread3.dispatch_grf_start_reg = 3; +   sf.thread3.urb_entry_read_offset = 1; + +   /* BRW_NEW_URB_FENCE */ +   sf.thread4.nr_urb_entries = brw->urb.nr_sf_entries; +   sf.thread4.urb_entry_allocation_size = brw->urb.sfsize - 1; +   sf.thread4.max_threads = MIN2(12, brw->urb.nr_sf_entries / 2) - 1; + +   if (BRW_DEBUG & DEBUG_SINGLE_THREAD) +      sf.thread4.max_threads = 0; + +   if (BRW_DEBUG & DEBUG_STATS) +      sf.thread4.stats_enable = 1; + +   /* CACHE_NEW_SF_VP */ +   sf.sf5.sf_viewport_state_offset = brw->sf.vp_gs_offset >> 5; +   sf.sf5.viewport_transform = 1; + +   /* BRW_NEW_RASTER */ +   if (brw->attribs.Raster->scissor) +      sf.sf6.scissor = 1; + +#if 0 +   if (brw->attribs.Polygon->FrontFace == GL_CCW) +      sf.sf5.front_winding = BRW_FRONTWINDING_CCW; +   else +      sf.sf5.front_winding = BRW_FRONTWINDING_CW; + + +   if (brw->attribs.Polygon->CullFlag) { +      switch (brw->attribs.Polygon->CullFaceMode) { +      case GL_FRONT: +	 sf.sf6.cull_mode = BRW_CULLMODE_FRONT; +	 break; +      case GL_BACK: +	 sf.sf6.cull_mode = BRW_CULLMODE_BACK; +	 break; +      case GL_FRONT_AND_BACK: +	 sf.sf6.cull_mode = BRW_CULLMODE_BOTH; +	 break; +      default: +	 assert(0); +	 break; +      } +   } +   else +      sf.sf6.cull_mode = BRW_CULLMODE_NONE; +#else +   sf.sf5.front_winding = BRW_FRONTWINDING_CCW; +   sf.sf6.cull_mode = BRW_CULLMODE_NONE; +#endif + +   sf.sf6.line_width = CLAMP(brw->attribs.Raster->line_width, 1.0, 5.0) * (1<<1); + +   sf.sf6.line_endcap_aa_region_width = 1; +   if (brw->attribs.Raster->line_smooth) +      sf.sf6.aa_enable = 1; +   else if (sf.sf6.line_width <= 0x2) +       sf.sf6.line_width = 0; + +   sf.sf6.point_rast_rule = 1;	/* opengl conventions */ + +   sf.sf7.sprite_point = brw->attribs.Raster->point_sprite; +   sf.sf7.point_size = 
CLAMP(brw->attribs.Raster->line_width, 1.0, 255.0) * (1<<3); +   sf.sf7.use_point_size_state = !brw->attribs.Raster->point_size_per_vertex; + +   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons: +    */ +   sf.sf7.trifan_pv = 2; +   sf.sf7.linestrip_pv = 1; +   sf.sf7.tristrip_pv = 2; +   sf.sf7.line_last_pixel_enable = 0; + +   /* Set bias for OpenGL rasterization rules: +    */ +   sf.sf6.dest_org_vbias = 0x8; +   sf.sf6.dest_org_hbias = 0x8; + +   brw->sf.state_gs_offset = brw_cache_data( &brw->cache[BRW_SF_UNIT], &sf ); +} + + +const struct brw_tracked_state brw_sf_unit = { +   .dirty = { +      .brw   = (BRW_NEW_RASTERIZER | +		BRW_NEW_URB_FENCE), +      .cache = (CACHE_NEW_SF_VP | +		CACHE_NEW_SF_PROG) +   }, +   .update = upload_sf_unit +}; + + diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c new file mode 100644 index 0000000000..86d877d7ef --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_shader_info.c @@ -0,0 +1,48 @@ + +#include "brw_context.h" +#include "brw_state.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + + +/** + * XXX this obsolete new and no longer compiled. + */ +void brw_shader_info(const struct tgsi_token *tokens, +		     struct brw_shader_info *info ) +{ +   struct tgsi_parse_context parse; +   int done = 0; + +   tgsi_parse_init( &parse, tokens ); + +   while( !done && +	  !tgsi_parse_end_of_tokens( &parse ) )  +   { +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +      { +	 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; +	 unsigned last = decl->DeclarationRange.Last; +       +	 // Broken by crazy wpos init: +	 //assert( info->nr_regs[decl->Declaration.File] <= last); + +	 info->nr_regs[decl->Declaration.File] = MAX2(info->nr_regs[decl->Declaration.File], +						      last+1); +	 break; +      } +      case TGSI_TOKEN_TYPE_IMMEDIATE: +      case TGSI_TOKEN_TYPE_INSTRUCTION: +      default: +	 done = 1; +	 break; +      } +   } + +   tgsi_parse_free (&parse); +    +} diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c new file mode 100644 index 0000000000..b47f5373f3 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state.c @@ -0,0 +1,469 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
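Two details in upload_sf_unit() above are easy to misread. First, line width and point size are packed as unsigned fixed-point values; judging from the shifts, line width carries one fractional bit and point size three. Second, the point-size expression clamps Raster->line_width, which looks like a copy of the line-width line; presumably Raster->point_size was intended. A small sketch of the packing (helper names are made up; CLAMP is the util/u_math.h macro the file already uses):

   /* Illustrative helpers, not driver code: the SF unit's fixed-point fields
    * as packed by upload_sf_unit() above. */
   static unsigned
   pack_line_width_example(float width)   /* e.g. 2.5f -> 5  (one frac bit) */
   {
      return (unsigned) (CLAMP(width, 1.0, 5.0) * (1 << 1));
   }

   static unsigned
   pack_point_size_example(float size)    /* e.g. 4.0f -> 32 (three frac bits) */
   {
      return (unsigned) (CLAMP(size, 1.0, 255.0) * (1 << 3));
   }
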
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors:  Zack Rusin <zack@tungstengraphics.com> + *           Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_draw.h" + + +#define DUP( TYPE, VAL )                        \ +do {                                            \ +   struct TYPE *x = malloc(sizeof(*x));         \ +   memcpy(x, VAL, sizeof(*x) );                 \ +   return x;                                    \ +} while (0) + +/************************************************************************ + * Blend  + */ +static void * +brw_create_blend_state(struct pipe_context *pipe, +                        const struct pipe_blend_state *blend) +{    +   DUP( pipe_blend_state, blend ); +} + +static void brw_bind_blend_state(struct pipe_context *pipe, +                                 void *blend) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.Blend = (struct pipe_blend_state*)blend; +   brw->state.dirty.brw |= BRW_NEW_BLEND; +} + + +static void brw_delete_blend_state(struct pipe_context *pipe, void *blend) +{ +   free(blend); +} + +static void brw_set_blend_color( struct pipe_context *pipe, +			     const struct pipe_blend_color *blend_color ) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.BlendColor = *blend_color; + +   brw->state.dirty.brw |= BRW_NEW_BLEND; +} + +/************************************************************************ + * Sampler  + */ + +static void * +brw_create_sampler_state(struct pipe_context *pipe, +                          const struct pipe_sampler_state *sampler) +{ +   DUP( pipe_sampler_state, sampler ); +} + +static void brw_bind_sampler_states(struct pipe_context *pipe, +                                    unsigned num, void **sampler) +{ +   struct brw_context *brw = brw_context(pipe); + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == brw->num_samplers && +       !memcmp(brw->attribs.Samplers, sampler, num * sizeof(void *))) +      return; + +   memcpy(brw->attribs.Samplers, sampler, num * sizeof(void *)); +   memset(&brw->attribs.Samplers[num], 0, (PIPE_MAX_SAMPLERS - num) * +          sizeof(void *)); + +   brw->num_samplers = num; + +   brw->state.dirty.brw |= BRW_NEW_SAMPLER; +} + +static void brw_delete_sampler_state(struct pipe_context *pipe, +                                      void *sampler) +{ +   free(sampler); +} + + +/************************************************************************ + * Depth stencil  + */ + +static void * +brw_create_depth_stencil_state(struct pipe_context *pipe, +                           const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ +   DUP( pipe_depth_stencil_alpha_state, depth_stencil ); +} + +static void brw_bind_depth_stencil_state(struct pipe_context *pipe, +                                         void *depth_stencil) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.DepthStencil = 
(const struct pipe_depth_stencil_alpha_state *)depth_stencil; + +   brw->state.dirty.brw |= BRW_NEW_DEPTH_STENCIL; +} + +static void brw_delete_depth_stencil_state(struct pipe_context *pipe, +                                           void *depth_stencil) +{ +   free(depth_stencil); +} + +/************************************************************************ + * Scissor + */ +static void brw_set_scissor_state( struct pipe_context *pipe, +                                 const struct pipe_scissor_state *scissor ) +{ +   struct brw_context *brw = brw_context(pipe); + +   memcpy( &brw->attribs.Scissor, scissor, sizeof(*scissor) ); +   brw->state.dirty.brw |= BRW_NEW_SCISSOR; +} + + +/************************************************************************ + * Stipple + */ + +static void brw_set_polygon_stipple( struct pipe_context *pipe, +                                   const struct pipe_poly_stipple *stipple ) +{ +} + + +/************************************************************************ + * Fragment shader + */ + +static void * brw_create_fs_state(struct pipe_context *pipe, +                                   const struct pipe_shader_state *shader) +{ +   struct brw_fragment_program *brw_fp = CALLOC_STRUCT(brw_fragment_program); + +   brw_fp->program.tokens = tgsi_dup_tokens(shader->tokens); +   brw_fp->id = brw_context(pipe)->program_id++; + +   tgsi_scan_shader(shader->tokens, &brw_fp->info); + +#if 0 +   brw_shader_info(shader->tokens, +		   &brw_fp->info2); +#endif + +   tgsi_dump(shader->tokens, 0); + + +   return (void *)brw_fp; +} + +static void brw_bind_fs_state(struct pipe_context *pipe, void *shader) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.FragmentProgram = (struct brw_fragment_program *)shader; +   brw->state.dirty.brw |= BRW_NEW_FS; +} + +static void brw_delete_fs_state(struct pipe_context *pipe, void *shader) +{ +   struct brw_fragment_program *brw_fp = (struct brw_fragment_program *) shader; + +   FREE((void *) brw_fp->program.tokens); +   FREE(brw_fp); +} + + +/************************************************************************ + * Vertex shader and other TNL state  + */ + +static void *brw_create_vs_state(struct pipe_context *pipe, +                                 const struct pipe_shader_state *shader) +{ +   struct brw_vertex_program *brw_vp = CALLOC_STRUCT(brw_vertex_program); + +   brw_vp->program.tokens = tgsi_dup_tokens(shader->tokens); +   brw_vp->id = brw_context(pipe)->program_id++; + +   tgsi_scan_shader(shader->tokens, &brw_vp->info); + +#if 0 +   brw_shader_info(shader->tokens, +		   &brw_vp->info2); +#endif +   tgsi_dump(shader->tokens, 0); + +   return (void *)brw_vp; +} + +static void brw_bind_vs_state(struct pipe_context *pipe, void *vs) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.VertexProgram = (struct brw_vertex_program *)vs; +   brw->state.dirty.brw |= BRW_NEW_VS; + +   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n"); +} + +static void brw_delete_vs_state(struct pipe_context *pipe, void *shader) +{ +   struct brw_vertex_program *brw_vp = (struct brw_vertex_program *) shader; + +   FREE((void *) brw_vp->program.tokens); +   FREE(brw_vp); +} + + +static void brw_set_clip_state( struct pipe_context *pipe, +                                const struct pipe_clip_state *clip ) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.Clip = *clip; +} + + +static void brw_set_viewport_state( struct pipe_context *pipe, +				     const struct pipe_viewport_state 
*viewport ) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.Viewport = *viewport; /* struct copy */ +   brw->state.dirty.brw |= BRW_NEW_VIEWPORT; + +   /* pass the viewport info to the draw module */ +   //draw_set_viewport_state(brw->draw, viewport); +} + + +static void brw_set_vertex_buffers(struct pipe_context *pipe, +				   unsigned count, +				   const struct pipe_vertex_buffer *buffers) +{ +   struct brw_context *brw = brw_context(pipe); +   memcpy(brw->vb.vbo_array, buffers, count * sizeof(buffers[0])); +} + +static void brw_set_vertex_elements(struct pipe_context *pipe, +                                    unsigned count, +                                    const struct pipe_vertex_element *elements) +{ +   /* flush ? */ +   struct brw_context *brw = brw_context(pipe); +   uint i; + +   assert(count <= PIPE_MAX_ATTRIBS); + +   for (i = 0; i < count; i++) { +      struct brw_vertex_element_state el; +      memset(&el, 0, sizeof(el)); + +      el.ve0.src_offset = elements[i].src_offset; +      el.ve0.src_format = brw_translate_surface_format(elements[i].src_format); +      el.ve0.valid = 1; +      el.ve0.vertex_buffer_index = elements[i].vertex_buffer_index; + +      el.ve1.dst_offset   = i * 4; + +      el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_SRC; +      el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_SRC; +      el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_SRC; +      el.ve1.vfcomponent0 = BRW_VFCOMPONENT_STORE_SRC; + +      switch (elements[i].nr_components) { +      case 1: el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_0; +      case 2: el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_0; +      case 3: el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_1_FLT; +         break; +      } + +      brw->vb.inputs[i] = el; +   } +} + + + +/************************************************************************ + * Constant buffers + */ + +static void brw_set_constant_buffer(struct pipe_context *pipe, +                                     uint shader, uint index, +                                     const struct pipe_constant_buffer *buf) +{ +   struct brw_context *brw = brw_context(pipe); + +   assert(buf == 0 || index == 0); + +   brw->attribs.Constants[shader] = buf; +   brw->state.dirty.brw |= BRW_NEW_CONSTANTS; +} + + +/************************************************************************ + * Texture surfaces + */ + + +static void brw_set_sampler_textures(struct pipe_context *pipe, +                                     unsigned num, +                                     struct pipe_texture **texture) +{ +   struct brw_context *brw = brw_context(pipe); +   uint i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == brw->num_textures && +       !memcmp(brw->attribs.Texture, texture, num * +               sizeof(struct pipe_texture *))) +      return; + +   for (i = 0; i < num; i++) +      pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i], +                             texture[i]); + +   for (i = num; i < brw->num_textures; i++) +      pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i], +                             NULL); + +   brw->num_textures = num; + +   brw->state.dirty.brw |= BRW_NEW_TEXTURE; +} + + +/************************************************************************ + * Render targets, etc + */ + +static void brw_set_framebuffer_state(struct pipe_context *pipe, +				       const struct pipe_framebuffer_state *fb) +{ +   struct brw_context *brw = brw_context(pipe); + +   
brw->attribs.FrameBuffer = *fb; /* struct copy */ + +   brw->state.dirty.brw |= BRW_NEW_FRAMEBUFFER; +} + + + +/************************************************************************ + * Rasterizer state + */ + +static void * +brw_create_rasterizer_state(struct pipe_context *pipe, +                             const struct pipe_rasterizer_state *rasterizer) +{ +   DUP(pipe_rasterizer_state, rasterizer); +} + +static void brw_bind_rasterizer_state( struct pipe_context *pipe, +                                        void *setup ) +{ +   struct brw_context *brw = brw_context(pipe); + +   brw->attribs.Raster = (struct pipe_rasterizer_state *)setup; + +   /* Also pass-through to draw module: +    */ +   //draw_set_rasterizer_state(brw->draw, setup); + +   brw->state.dirty.brw |= BRW_NEW_RASTERIZER; +} + +static void brw_delete_rasterizer_state(struct pipe_context *pipe, +                                         void *setup) +{ +   free(setup); +} + + + +void +brw_init_state_functions( struct brw_context *brw ) +{ +   brw->pipe.create_blend_state = brw_create_blend_state; +   brw->pipe.bind_blend_state = brw_bind_blend_state; +   brw->pipe.delete_blend_state = brw_delete_blend_state; + +   brw->pipe.create_sampler_state = brw_create_sampler_state; +   brw->pipe.bind_sampler_states = brw_bind_sampler_states; +   brw->pipe.delete_sampler_state = brw_delete_sampler_state; + +   brw->pipe.create_depth_stencil_alpha_state = brw_create_depth_stencil_state; +   brw->pipe.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state; +   brw->pipe.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state; + +   brw->pipe.create_rasterizer_state = brw_create_rasterizer_state; +   brw->pipe.bind_rasterizer_state = brw_bind_rasterizer_state; +   brw->pipe.delete_rasterizer_state = brw_delete_rasterizer_state; +   brw->pipe.create_fs_state = brw_create_fs_state; +   brw->pipe.bind_fs_state = brw_bind_fs_state; +   brw->pipe.delete_fs_state = brw_delete_fs_state; +   brw->pipe.create_vs_state = brw_create_vs_state; +   brw->pipe.bind_vs_state = brw_bind_vs_state; +   brw->pipe.delete_vs_state = brw_delete_vs_state; + +   brw->pipe.set_blend_color = brw_set_blend_color; +   brw->pipe.set_clip_state = brw_set_clip_state; +   brw->pipe.set_constant_buffer = brw_set_constant_buffer; +   brw->pipe.set_framebuffer_state = brw_set_framebuffer_state; + +//   brw->pipe.set_feedback_state = brw_set_feedback_state; +//   brw->pipe.set_feedback_buffer = brw_set_feedback_buffer; + +   brw->pipe.set_polygon_stipple = brw_set_polygon_stipple; +   brw->pipe.set_scissor_state = brw_set_scissor_state; +   brw->pipe.set_sampler_textures = brw_set_sampler_textures; +   brw->pipe.set_viewport_state = brw_set_viewport_state; +   brw->pipe.set_vertex_buffers = brw_set_vertex_buffers; +   brw->pipe.set_vertex_elements = brw_set_vertex_elements; +} diff --git a/src/gallium/drivers/i965simple/brw_state.h b/src/gallium/drivers/i965simple/brw_state.h new file mode 100644 index 0000000000..de0a6371b8 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state.h @@ -0,0 +1,151 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
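One subtlety in brw_set_vertex_elements() above: the switch over nr_components relies on case fall-through, so every component the vertex buffer does not supply is overridden from STORE_SRC to a default, giving the usual (x, 0, 0, 1) / (x, y, 0, 1) / (x, y, z, 1) expansion. The same logic written out long-hand, as an illustration only (the helper is hypothetical):

   /* Equivalent of the fall-through switch above: components missing from
    * the source vertex element default to 0, 0, 1. */
   static void
   fill_missing_components_example(unsigned nr_components, float comp[4])
   {
      if (nr_components < 2) comp[1] = 0.0f;
      if (nr_components < 3) comp[2] = 0.0f;
      if (nr_components < 4) comp[3] = 1.0f;
      /* e.g. a two-component texcoord (s, t) is fetched as (s, t, 0, 1) */
   }
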
+  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +     + +#ifndef BRW_STATE_H +#define BRW_STATE_H + +#include "brw_context.h" +#include "brw_winsys.h" + + +const struct brw_tracked_state brw_blend_constant_color; +const struct brw_tracked_state brw_cc_unit; +const struct brw_tracked_state brw_cc_vp; +const struct brw_tracked_state brw_clip_prog; +const struct brw_tracked_state brw_clip_unit; +const struct brw_tracked_state brw_constant_buffer_state; +const struct brw_tracked_state brw_constant_buffer; +const struct brw_tracked_state brw_curbe_offsets; +const struct brw_tracked_state brw_invarient_state; +const struct brw_tracked_state brw_gs_prog; +const struct brw_tracked_state brw_gs_unit; +const struct brw_tracked_state brw_drawing_rect; +const struct brw_tracked_state brw_line_stipple; +const struct brw_tracked_state brw_pipelined_state_pointers; +const struct brw_tracked_state brw_binding_table_pointers; +const struct brw_tracked_state brw_depthbuffer; +const struct brw_tracked_state brw_polygon_stipple_offset; +const struct brw_tracked_state brw_polygon_stipple; +const struct brw_tracked_state brw_program_parameters; +const struct brw_tracked_state brw_recalculate_urb_fence; +const struct brw_tracked_state brw_sf_prog; +const struct brw_tracked_state brw_sf_unit; +const struct brw_tracked_state brw_sf_vp; +const struct brw_tracked_state brw_state_base_address; +const struct brw_tracked_state brw_urb_fence; +const struct brw_tracked_state brw_vertex_state; +const struct brw_tracked_state brw_vs_prog; +const struct brw_tracked_state brw_vs_unit; +const struct brw_tracked_state brw_wm_prog; +const struct brw_tracked_state brw_wm_samplers; +const struct brw_tracked_state brw_wm_surfaces; +const struct brw_tracked_state brw_wm_unit; + +const struct brw_tracked_state brw_psp_urb_cbs; + +const struct brw_tracked_state brw_active_vertprog; +const struct brw_tracked_state brw_tnl_vertprog; +const struct brw_tracked_state brw_pipe_control; + +const struct brw_tracked_state brw_clear_surface_cache; +const struct brw_tracked_state brw_clear_batch_cache; + +/*********************************************************************** + * brw_state_cache.c + */ +unsigned brw_cache_data(struct brw_cache *cache, +		      const void *data ); + +unsigned brw_cache_data_sz(struct brw_cache *cache, +			 const void *data, 
+			 unsigned data_sz); + +unsigned brw_upload_cache( struct brw_cache *cache, +			 const void *key, +			 unsigned key_sz, +			 const void *data, +			 unsigned data_sz, +			 const void *aux, +			 void *aux_return ); + +boolean brw_search_cache( struct brw_cache *cache, +			    const void *key, +			    unsigned key_size, +			    void *aux_return, +			    unsigned *offset_return); + +void brw_init_caches( struct brw_context *brw ); +void brw_destroy_caches( struct brw_context *brw ); + +static inline struct pipe_buffer *brw_cache_buffer(struct brw_context *brw, +                                                          enum brw_cache_id id) +{ +   return brw->cache[id].pool->buffer; +} + +/*********************************************************************** + * brw_state_batch.c + */ +#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) ) + +boolean brw_cached_batch_struct( struct brw_context *brw, +				   const void *data, +				   unsigned sz ); + +void brw_destroy_batch_cache( struct brw_context *brw ); + + +/*********************************************************************** + * brw_state_pool.c + */ +void brw_init_pools( struct brw_context *brw ); +void brw_destroy_pools( struct brw_context *brw ); + +boolean brw_pool_alloc( struct brw_mem_pool *pool, +			  unsigned size, +			  unsigned alignment, +			  unsigned *offset_return); + +void brw_pool_fence( struct brw_context *brw, +		     struct brw_mem_pool *pool, +		     unsigned fence ); + + +void brw_pool_check_wrap( struct brw_context *brw, +			  struct brw_mem_pool *pool ); + +void brw_clear_all_caches( struct brw_context *brw ); +void brw_invalidate_pools( struct brw_context *brw ); +void brw_clear_batch_cache_flush( struct brw_context *brw ); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c new file mode 100644 index 0000000000..43a1c89fc4 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_batch.c @@ -0,0 +1,113 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
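The BRW_CACHED_BATCH_STRUCT() macro declared in brw_state.h above is the entry point for the dedup logic defined in this file: a packet is only appended to the batchbuffer when its contents differ from the last copy emitted with the same opcode. A usage sketch with a packet type from brw_structs.h (the wrapper function is hypothetical and the opcode/length fill-in is elided):

   /* Usage sketch only: emitting a polygon-stipple packet through the
    * cached-batch path.  A second call with identical contents is dropped
    * (brw_cached_batch_struct() returns FALSE without touching the batch). */
   static void
   emit_stipple_example(struct brw_context *brw, const unsigned pattern[32])
   {
      struct brw_polygon_stipple bps;

      memset(&bps, 0, sizeof(bps));
      /* bps.header.opcode / .length would be filled in as the real atom does */
      memcpy(bps.stipple, pattern, sizeof(bps.stipple));

      BRW_CACHED_BATCH_STRUCT(brw, &bps);
   }
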
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_state.h" +#include "brw_winsys.h" + +#include "util/u_memory.h" + +/* A facility similar to the data caching code above, which aims to + * prevent identical commands being issued repeatedly. + */ +boolean brw_cached_batch_struct( struct brw_context *brw, +                                 const void *data, +                                 unsigned sz ) +{ +   struct brw_cached_batch_item *item = brw->cached_batch_items; +   struct header *newheader = (struct header *)data; + +   if (brw->emit_state_always) { +      brw_batchbuffer_data(brw->winsys, data, sz); +      return TRUE; +   } + +   while (item) { +      if (item->header->opcode == newheader->opcode) { +	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0) +	    return FALSE; +	 if (item->sz != sz) { +	    FREE(item->header); +	    item->header = MALLOC(sz); +	    item->sz = sz; +	 } +	 goto emit; +      } +      item = item->next; +   } + +   assert(!item); +   item = CALLOC_STRUCT(brw_cached_batch_item); +   item->header = MALLOC(sz); +   item->sz = sz; +   item->next = brw->cached_batch_items; +   brw->cached_batch_items = item; + +emit: +   memcpy(item->header, newheader, sz); +   brw_batchbuffer_data(brw->winsys, data, sz); +   return TRUE; +} + +static void clear_batch_cache( struct brw_context *brw ) +{ +   struct brw_cached_batch_item *item = brw->cached_batch_items; + +   while (item) { +      struct brw_cached_batch_item *next = item->next; +      free((void *)item->header); +      free(item); +      item = next; +   } + +   brw->cached_batch_items = NULL; + + +   brw_clear_all_caches(brw); + +   brw_invalidate_pools(brw); +} + +void brw_clear_batch_cache_flush( struct brw_context *brw ) +{ +   clear_batch_cache(brw); + +/*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */ + +   brw->state.dirty.brw |= ~0; +   brw->state.dirty.cache |= ~0; +} + + + +void brw_destroy_batch_cache( struct brw_context *brw ) +{ +   clear_batch_cache(brw); +} diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c new file mode 100644 index 0000000000..094248fa69 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_cache.c @@ -0,0 +1,443 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
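Note how clear_batch_cache() / brw_clear_batch_cache_flush() above end by setting every bit in brw->state.dirty: an atom is re-run whenever its dirty mask intersects the accumulated flags, so all-ones forces a full re-emit on the next validate. A sketch of that intersection test, which simply mirrors what brw_state_upload.c later implements as check_state() (the function name here is made up):

   /* Illustration: why "dirty |= ~0" above re-runs every tracked-state atom. */
   static boolean
   atom_needs_update_example(const struct brw_state_flags *dirty,
                             const struct brw_tracked_state *atom)
   {
      return (dirty->brw & atom->dirty.brw) ||
             (dirty->cache & atom->dirty.cache);
   }
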
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_state.h" + +#include "brw_wm.h" +#include "brw_vs.h" +#include "brw_clip.h" +#include "brw_sf.h" +#include "brw_gs.h" + +#include "util/u_memory.h" + + + +/*********************************************************************** + * Check cache for uploaded version of struct, else upload new one. + * Fail when memory is exhausted. + * + * XXX: FIXME: Currently search is so slow it would be quicker to + * regenerate the data every time... + */ + +static unsigned hash_key( const void *key, unsigned key_size ) +{ +   unsigned *ikey = (unsigned *)key; +   unsigned hash = 0, i; + +   assert(key_size % 4 == 0); + +   /* I'm sure this can be improved on: +    */ +   for (i = 0; i < key_size/4; i++) +      hash ^= ikey[i]; + +   return hash; +} + +static struct brw_cache_item *search_cache( struct brw_cache *cache, +					     unsigned hash, +					     const void *key, +					     unsigned key_size) +{ +   struct brw_cache_item *c; + +   for (c = cache->items[hash % cache->size]; c; c = c->next) { +      if (c->hash == hash && +	  c->key_size == key_size && +	  memcmp(c->key, key, key_size) == 0) +	 return c; +   } + +   return NULL; +} + + +static void rehash( struct brw_cache *cache ) +{ +   struct brw_cache_item **items; +   struct brw_cache_item *c, *next; +   unsigned size, i; + +   size = cache->size * 3; +   items = (struct brw_cache_item**) MALLOC(size * sizeof(*items)); +   memset(items, 0, size * sizeof(*items)); + +   for (i = 0; i < cache->size; i++) +      for (c = cache->items[i]; c; c = next) { +	 next = c->next; +	 c->next = items[c->hash % size]; +	 items[c->hash % size] = c; +      } + +   FREE(cache->items); +   cache->items = items; +   cache->size = size; +} + + +boolean brw_search_cache( struct brw_cache *cache, +			    const void *key, +			    unsigned key_size, +			    void *aux_return, +			    unsigned *offset_return) +{ +   struct brw_cache_item *item; +   unsigned addr = 0; +   unsigned hash = hash_key(key, key_size); + +   item = search_cache(cache, hash, key, key_size); + +   if (item) { +      if (aux_return) +	 *(void **)aux_return = (void *)((char *)item->key + item->key_size); + +      *offset_return = addr = item->offset; +   } + +   if (item == NULL || addr != cache->last_addr) { +      cache->brw->state.dirty.cache |= 1<<cache->id; +      cache->last_addr = addr; +   } + +   return item != NULL; +} + +unsigned brw_upload_cache( struct brw_cache *cache, +			 const void *key, +			 unsigned key_size, +			 const void *data, +			 unsigned data_size, +			 const void *aux, +			 void *aux_return ) +{ +   unsigned offset; +   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); +   unsigned hash = hash_key(key, key_size); +   void *tmp = MALLOC(key_size + cache->aux_size); + +   if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) { +      /* Should not be possible: +       */ +      debug_printf("brw_pool_alloc failed\n"); +      exit(1); +   } + +   memcpy(tmp, key, key_size); + +   if (cache->aux_size) +      memcpy(tmp+key_size, aux, cache->aux_size); + +   item->key = tmp; +   item->hash = hash; +   
item->key_size = key_size; +   item->offset = offset; +   item->data_size = data_size; + +   if (++cache->n_items > cache->size * 1.5) +      rehash(cache); + +   hash %= cache->size; +   item->next = cache->items[hash]; +   cache->items[hash] = item; + +   if (aux_return) { +      assert(cache->aux_size); +      *(void **)aux_return = (void *)((char *)item->key + item->key_size); +   } + +   if (BRW_DEBUG & DEBUG_STATE) +      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n", +             cache->name,  +	     data_size, +             (void*)cache->pool->buffer, +             offset); + +   /* Copy data to the buffer: +    */ +   cache->brw->winsys->buffer_subdata_typed(cache->brw->winsys, +					    cache->pool->buffer,  +					    offset,  +					    data_size,  +					    data, +					    cache->id); + +   cache->brw->state.dirty.cache |= 1<<cache->id; +   cache->last_addr = offset; + +   return offset; +} + +/* This doesn't really work with aux data.  Use search/upload instead + */ +unsigned brw_cache_data_sz(struct brw_cache *cache, +			 const void *data, +			 unsigned data_size) +{ +   unsigned addr; + +   if (!brw_search_cache(cache, data, data_size, NULL, &addr)) { +      addr = brw_upload_cache(cache, +			      data, data_size, +			      data, data_size, +			      NULL, NULL); +   } + +   return addr; +} + +unsigned brw_cache_data(struct brw_cache *cache, +		      const void *data) +{ +   return brw_cache_data_sz(cache, data, cache->key_size); +} + +enum pool_type { +   DW_SURFACE_STATE, +   DW_GENERAL_STATE +}; + +static void brw_init_cache( struct brw_context *brw, +			    const char *name, +			    unsigned id, +			    unsigned key_size, +			    unsigned aux_size, +			    enum pool_type pool_type) +{ +   struct brw_cache *cache = &brw->cache[id]; +   cache->brw = brw; +   cache->id = id; +   cache->name = name; +   cache->items = NULL; + +   cache->size = 7; +   cache->n_items = 0; +   cache->items = (struct brw_cache_item **) +      CALLOC(cache->size, sizeof(struct brw_cache_item)); + + +   cache->key_size = key_size; +   cache->aux_size = aux_size; +   switch (pool_type) { +   case DW_GENERAL_STATE: cache->pool = &brw->pool[BRW_GS_POOL]; break; +   case DW_SURFACE_STATE: cache->pool = &brw->pool[BRW_SS_POOL]; break; +   default: assert(0); break; +   } +} + +void brw_init_caches( struct brw_context *brw ) +{ + +   brw_init_cache(brw, +		  "CC_VP", +		  BRW_CC_VP, +		  sizeof(struct brw_cc_viewport), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "CC_UNIT", +		  BRW_CC_UNIT, +		  sizeof(struct brw_cc_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "WM_PROG", +		  BRW_WM_PROG, +		  sizeof(struct brw_wm_prog_key), +		  sizeof(struct brw_wm_prog_data), +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SAMPLER_DEFAULT_COLOR", +		  BRW_SAMPLER_DEFAULT_COLOR, +		  sizeof(struct brw_sampler_default_color), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SAMPLER", +		  BRW_SAMPLER, +		  0,		/* variable key/data size */ +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "WM_UNIT", +		  BRW_WM_UNIT, +		  sizeof(struct brw_wm_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SF_PROG", +		  BRW_SF_PROG, +		  sizeof(struct brw_sf_prog_key), +		  sizeof(struct brw_sf_prog_data), +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SF_VP", +		  BRW_SF_VP, +		  sizeof(struct brw_sf_viewport), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SF_UNIT", +		  
BRW_SF_UNIT, +		  sizeof(struct brw_sf_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "VS_UNIT", +		  BRW_VS_UNIT, +		  sizeof(struct brw_vs_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "VS_PROG", +		  BRW_VS_PROG, +		  sizeof(struct brw_vs_prog_key), +		  sizeof(struct brw_vs_prog_data), +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "CLIP_UNIT", +		  BRW_CLIP_UNIT, +		  sizeof(struct brw_clip_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "CLIP_PROG", +		  BRW_CLIP_PROG, +		  sizeof(struct brw_clip_prog_key), +		  sizeof(struct brw_clip_prog_data), +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "GS_UNIT", +		  BRW_GS_UNIT, +		  sizeof(struct brw_gs_unit_state), +		  0, +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "GS_PROG", +		  BRW_GS_PROG, +		  sizeof(struct brw_gs_prog_key), +		  sizeof(struct brw_gs_prog_data), +		  DW_GENERAL_STATE); + +   brw_init_cache(brw, +		  "SS_SURFACE", +		  BRW_SS_SURFACE, +		  sizeof(struct brw_surface_state), +		  0, +		  DW_SURFACE_STATE); + +   brw_init_cache(brw, +		  "SS_SURF_BIND", +		  BRW_SS_SURF_BIND, +		  sizeof(struct brw_surface_binding_table), +		  0, +		  DW_SURFACE_STATE); +} + + +/* When we lose hardware context, need to invalidate the surface cache + * as these structs must be explicitly re-uploaded.  They are subject + * to fixup by the memory manager as they contain absolute agp + * offsets, so we need to ensure there is a fresh version of the + * struct available to receive the fixup. + * + * XXX: Need to ensure that there aren't two versions of a surface or + * bufferobj with different backing data active in the same buffer at + * once?  Otherwise the cache could confuse them.  Maybe better not to + * cache at all? + * + * --> Isn't this the same as saying need to ensure batch is flushed + *         before new data is uploaded to an existing buffer?  We + *         already try to make sure of that. + */ +static void clear_cache( struct brw_cache *cache ) +{ +   struct brw_cache_item *c, *next; +   unsigned i; + +   for (i = 0; i < cache->size; i++) { +      for (c = cache->items[i]; c; c = next) { +	 next = c->next; +	 free((void *)c->key); +	 free(c); +      } +      cache->items[i] = NULL; +   } + +   cache->n_items = 0; +} + +void brw_clear_all_caches( struct brw_context *brw ) +{ +   int i; + +   if (BRW_DEBUG & DEBUG_STATE) +      debug_printf("%s\n", __FUNCTION__); + +   for (i = 0; i < BRW_MAX_CACHE; i++) +      clear_cache(&brw->cache[i]); + +   if (brw->curbe.last_buf) { +      FREE(brw->curbe.last_buf); +      brw->curbe.last_buf = NULL; +   } + +   brw->state.dirty.brw |= ~0; +   brw->state.dirty.cache |= ~0; +} + + + + + +void brw_destroy_caches( struct brw_context *brw ) +{ +   unsigned i; + +   for (i = 0; i < BRW_MAX_CACHE; i++) +      clear_cache(&brw->cache[i]); +} diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c new file mode 100644 index 0000000000..e91263cb1f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_pool.c @@ -0,0 +1,138 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
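The cache above is used in two ways. Plain unit state (e.g. upload_sf_unit() earlier) goes through brw_cache_data(), where the packed struct serves as both key and data, so unchanged state re-binds to the offset already uploaded to the pool. The program caches additionally carry aux data (the *_prog_data structs), and their callers probe with brw_search_cache() before compiling. A sketch of that second pattern, roughly how the *_prog atoms use it (the wrapper and its parameters are hypothetical):

   /* Sketch: look up a compiled kernel by key, compile + upload only on a miss.
    * "code"/"prog_data" stand in for the compiler's output. */
   static unsigned
   find_or_upload_example(struct brw_cache *cache,
                          const void *key, unsigned key_size,
                          const void *code, unsigned code_size,
                          const void *prog_data, void **prog_data_out)
   {
      unsigned offset;

      if (brw_search_cache(cache, key, key_size, prog_data_out, &offset))
         return offset;                  /* hit: reuse the uploaded kernel */

      return brw_upload_cache(cache, key, key_size,
                              code, code_size,
                              prog_data, prog_data_out);
   }
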
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +/** @file brw_state_pool.c + * Implements the state pool allocator. + * + * For the 965, we create two state pools for state cache entries.  Objects + * will be allocated into the pools depending on which state base address + * their pointer is relative to in other 965 state. + * + * The state pools are relatively simple: As objects are allocated, increment + * the offset to allocate space.  When the pool is "full" (rather, close to + * full), we reset the pool and reset the state cache entries that point into + * the pool. 
+ */ + +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "brw_context.h" +#include "brw_state.h" + +boolean brw_pool_alloc( struct brw_mem_pool *pool, +			  unsigned size, +			  unsigned alignment, +			  unsigned *offset_return) +{ +   unsigned fixup = align(pool->offset, alignment) - pool->offset; + +   size = align(size, 4); + +   if (pool->offset + fixup + size >= pool->size) { +      debug_printf("%s failed\n", __FUNCTION__); +      assert(0); +      exit(0); +   } + +   pool->offset += fixup; +   *offset_return = pool->offset; +   pool->offset += size; + +   return TRUE; +} + +static +void brw_invalidate_pool( struct brw_mem_pool *pool ) +{ +   if (BRW_DEBUG & DEBUG_STATE) +      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__); + +   pool->offset = 0; + +   brw_clear_all_caches(pool->brw); +} + + +static void brw_init_pool( struct brw_context *brw, +			   unsigned pool_id, +			   unsigned size ) +{ +   struct brw_mem_pool *pool = &brw->pool[pool_id]; + +   pool->size = size; +   pool->brw = brw; + +   pool->buffer = pipe_buffer_create(brw->pipe.screen, +                                     4096, +                                     0 /*  DRM_BO_FLAG_MEM_TT */, +                                     size); +} + +static void brw_destroy_pool( struct brw_context *brw, +			      unsigned pool_id ) +{ +   struct brw_mem_pool *pool = &brw->pool[pool_id]; + +   pipe_buffer_reference( pool->brw->pipe.screen, +			  &pool->buffer, +			  NULL ); +} + + +void brw_pool_check_wrap( struct brw_context *brw, +			  struct brw_mem_pool *pool ) +{ +   if (pool->offset > (pool->size * 3) / 4) { +      brw->state.dirty.brw |= BRW_NEW_SCENE; +   } + +} + +void brw_init_pools( struct brw_context *brw ) +{ +   brw_init_pool(brw, BRW_GS_POOL, 0x80000); +   brw_init_pool(brw, BRW_SS_POOL, 0x80000); +} + +void brw_destroy_pools( struct brw_context *brw ) +{ +   brw_destroy_pool(brw, BRW_GS_POOL); +   brw_destroy_pool(brw, BRW_SS_POOL); +} + + +void brw_invalidate_pools( struct brw_context *brw ) +{ +   brw_invalidate_pool(&brw->pool[BRW_GS_POOL]); +   brw_invalidate_pool(&brw->pool[BRW_SS_POOL]); +} diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c new file mode 100644 index 0000000000..bac9161b5f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_upload.c @@ -0,0 +1,202 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_state.h" + +#include "util/u_memory.h" + +/* This is used to initialize brw->state.atoms[].  We could use this + * list directly except for a single atom, brw_constant_buffer, which + * has a .dirty value which changes according to the parameters of the + * current fragment and vertex programs, and so cannot be a static + * value. + */ +const struct brw_tracked_state *atoms[] = +{ +   &brw_vs_prog, +   &brw_gs_prog, +   &brw_clip_prog, +   &brw_sf_prog, +   &brw_wm_prog, + +   /* Once all the programs are done, we know how large urb entry +    * sizes need to be and can decide if we need to change the urb +    * layout. +    */ +   &brw_curbe_offsets, +   &brw_recalculate_urb_fence, + + +   &brw_cc_vp, +   &brw_cc_unit, + +   &brw_wm_surfaces,		/* must do before samplers */ +   &brw_wm_samplers, + +   &brw_wm_unit, +   &brw_sf_vp, +   &brw_sf_unit, +   &brw_vs_unit,		/* always required, enabled or not */ +   &brw_clip_unit, +   &brw_gs_unit, + +   /* Command packets: +    */ +   &brw_invarient_state, +   &brw_state_base_address, +   &brw_pipe_control, + +   &brw_binding_table_pointers, +   &brw_blend_constant_color, + +   &brw_drawing_rect, +   &brw_depthbuffer, + +   &brw_polygon_stipple, +   &brw_line_stipple, + +   &brw_psp_urb_cbs, + +   &brw_constant_buffer +}; + + +void brw_init_state( struct brw_context *brw ) +{ +   brw_init_pools(brw); +   brw_init_caches(brw); + +   brw->state.dirty.brw = ~0; +   brw->emit_state_always = 0; +} + + +void brw_destroy_state( struct brw_context *brw ) +{ +   brw_destroy_caches(brw); +   brw_destroy_batch_cache(brw); +   brw_destroy_pools(brw); +} + +/*********************************************************************** + */ + +static boolean check_state( const struct brw_state_flags *a, +			      const struct brw_state_flags *b ) +{ +   return ((a->brw & b->brw) || +	   (a->cache & b->cache)); +} + +static void accumulate_state( struct brw_state_flags *a, +			      const struct brw_state_flags *b ) +{ +   a->brw |= b->brw; +   a->cache |= b->cache; +} + + +static void xor_states( struct brw_state_flags *result, +			     const struct brw_state_flags *a, +			      const struct brw_state_flags *b ) +{ +   result->brw = a->brw ^ b->brw; +   result->cache = a->cache ^ b->cache; +} + + +/*********************************************************************** + * Emit all state: + */ +void brw_validate_state( struct brw_context *brw ) +{ +   struct brw_state_flags *state = &brw->state.dirty; +   unsigned i; + +   if (brw->emit_state_always)  +      state->brw |= ~0; + +   if (state->cache == 0 && +       state->brw == 0) +      return; + +   if (brw->state.dirty.brw & BRW_NEW_SCENE) +      brw_clear_batch_cache_flush(brw); + +   if (BRW_DEBUG) { +      /* Debug version which enforces various sanity checks on the +       * state flags which are generated and checked to help ensure +       * state atoms are ordered correctly in the list. 
+       */ +      struct brw_state_flags examined, prev; +      memset(&examined, 0, sizeof(examined)); +      prev = *state; + +      for (i = 0; i < Elements(atoms); i++) { +	 const struct brw_tracked_state *atom = atoms[i]; +	 struct brw_state_flags generated; + +	 assert(atom->dirty.brw || +		atom->dirty.cache); +	 assert(atom->update); + +	 if (check_state(state, &atom->dirty)) { +	    atom->update( brw ); +	 } + +	 accumulate_state(&examined, &atom->dirty); + +	 /* generated = (prev ^ state) +	  * if (examined & generated) +	  *     fail; +	  */ +	 xor_states(&generated, &prev, state); +	 assert(!check_state(&examined, &generated)); +	 prev = *state; +      } +   } +   else { +      for (i = 0; i < Elements(atoms); i++) { +	 const struct brw_tracked_state *atom = atoms[i]; + +	 assert(atom->dirty.brw || +		atom->dirty.cache); +	 assert(atom->update); + +	 if (check_state(state, &atom->dirty)) +	    atom->update( brw ); +      } +   } + +   memset(state, 0, sizeof(*state)); +} diff --git a/src/gallium/drivers/i965simple/brw_structs.h b/src/gallium/drivers/i965simple/brw_structs.h new file mode 100644 index 0000000000..bbb087e95d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_structs.h @@ -0,0 +1,1348 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
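The debug path of brw_validate_state() above enforces an ordering invariant on the atom list: after each update() it XORs the previous and current dirty flags to find the bits that update just generated, and asserts that none of them were already examined by an earlier atom, since that atom would then have run against stale state. A worked illustration with two flags from this driver (the function and the flag values chosen are hypothetical; boolean/TRUE come from p_compiler.h as elsewhere):

   /* Worked example of the ordering assertion above: an earlier atom consumed
    * BRW_NEW_URB_FENCE, then a later atom's update() set it again. */
   static boolean
   atoms_misordered_example(void)
   {
      unsigned examined  = BRW_NEW_URB_FENCE;                   /* already consumed    */
      unsigned prev      = BRW_NEW_SCISSOR;                     /* flags before update */
      unsigned state     = BRW_NEW_SCISSOR | BRW_NEW_URB_FENCE; /* flags after update  */
      unsigned generated = prev ^ state;                        /* == BRW_NEW_URB_FENCE */

      return (examined & generated) != 0;   /* TRUE -> the assert above would fire */
   }
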
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRW_STRUCTS_H +#define BRW_STRUCTS_H + +#include "pipe/p_compiler.h" + +/* Command packets: + */ +struct header +{ +   unsigned length:16; +   unsigned opcode:16; +}; + + +union header_union +{ +   struct header bits; +   unsigned dword; +}; + +struct brw_3d_control +{ +   struct +   { +      unsigned length:8; +      unsigned notify_enable:1; +      unsigned pad:3; +      unsigned wc_flush_enable:1; +      unsigned depth_stall_enable:1; +      unsigned operation:2; +      unsigned opcode:16; +   } header; + +   struct +   { +      unsigned pad:2; +      unsigned dest_addr_type:1; +      unsigned dest_addr:29; +   } dest; + +   unsigned dword2; +   unsigned dword3; +}; + + +struct brw_3d_primitive +{ +   struct +   { +      unsigned length:8; +      unsigned pad:2; +      unsigned topology:5; +      unsigned indexed:1; +      unsigned opcode:16; +   } header; + +   unsigned verts_per_instance; +   unsigned start_vert_location; +   unsigned instance_count; +   unsigned start_instance_location; +   unsigned base_vert_location; +}; + +/* These seem to be passed around as function args, so it works out + * better to keep them as #defines: + */ +#define BRW_FLUSH_READ_CACHE           0x1 +#define BRW_FLUSH_STATE_CACHE          0x2 +#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4 +#define BRW_FLUSH_SNAPSHOT_COUNTERS    0x8 + +struct brw_mi_flush +{ +   unsigned flags:4; +   unsigned pad:12; +   unsigned opcode:16; +}; + +struct brw_vf_statistics +{ +   unsigned statistics_enable:1; +   unsigned pad:15; +   unsigned opcode:16; +}; + + + +struct brw_binding_table_pointers +{ +   struct header header; +   unsigned vs; +   unsigned gs; +   unsigned clp; +   unsigned sf; +   unsigned wm; +}; + + +struct brw_blend_constant_color +{ +   struct header header; +   float blend_constant_color[4]; +}; + + +struct brw_depthbuffer +{ +   union header_union header; + +   union { +      struct { +	 unsigned pitch:18; +	 unsigned format:3; +	 unsigned pad:4; +	 unsigned depth_offset_disable:1; +	 unsigned tile_walk:1; +	 unsigned tiled_surface:1; +	 unsigned pad2:1; +	 unsigned surface_type:3; +      } bits; +      unsigned dword; +   } dword1; + +   unsigned dword2_base_addr; + +   union { +      struct { +	 unsigned pad:1; +	 unsigned mipmap_layout:1; +	 unsigned lod:4; +	 unsigned width:13; +	 unsigned height:13; +      } bits; +      unsigned dword; +   } dword3; + +   union { +      struct { +	 unsigned pad:12; +	 unsigned min_array_element:9; +	 unsigned depth:11; +      } bits; +      unsigned dword; +   } dword4; +}; + +struct brw_drawrect +{ +   struct header header; +   unsigned xmin:16; +   unsigned ymin:16; +   unsigned xmax:16; +   unsigned ymax:16; +   unsigned xorg:16; +   unsigned yorg:16; +}; + + + + +struct brw_global_depth_offset_clamp +{ +   struct header header; +   float depth_offset_clamp; +}; + +struct brw_indexbuffer +{ +   union { +      struct +      { +	 unsigned length:8; +	 unsigned index_format:2; +	 unsigned cut_index_enable:1; +	 unsigned pad:5; +	 unsigned opcode:16; +      } bits; +      unsigned dword; + +   } header; + +   unsigned buffer_start; +   unsigned buffer_end; +}; + + +struct brw_line_stipple +{ +   struct header header; + +   struct +   { +      unsigned pattern:16; +      unsigned pad:16; +   } bits0; + +   struct +   { +      unsigned repeat_count:9; +      unsigned pad:7; +      unsigned 
inverse_repeat_count:16; +   } bits1; +}; + + +struct brw_pipelined_state_pointers +{ +   struct header header; + +   struct { +      unsigned pad:5; +      unsigned offset:27; +   } vs; + +   struct +   { +      unsigned enable:1; +      unsigned pad:4; +      unsigned offset:27; +   } gs; + +   struct +   { +      unsigned enable:1; +      unsigned pad:4; +      unsigned offset:27; +   } clp; + +   struct +   { +      unsigned pad:5; +      unsigned offset:27; +   } sf; + +   struct +   { +      unsigned pad:5; +      unsigned offset:27; +   } wm; + +   struct +   { +      unsigned pad:5; +      unsigned offset:27; /* KW: check me! */ +   } cc; +}; + + +struct brw_polygon_stipple_offset +{ +   struct header header; + +   struct { +      unsigned y_offset:5; +      unsigned pad:3; +      unsigned x_offset:5; +      unsigned pad0:19; +   } bits0; +}; + + + +struct brw_polygon_stipple +{ +   struct header header; +   unsigned stipple[32]; +}; + + + +struct brw_pipeline_select +{ +   struct +   { +      unsigned pipeline_select:1; +      unsigned pad:15; +      unsigned opcode:16; +   } header; +}; + + +struct brw_pipe_control +{ +   struct +   { +      unsigned length:8; +      unsigned notify_enable:1; +      unsigned pad:2; +      unsigned instruction_state_cache_flush_enable:1; +      unsigned write_cache_flush_enable:1; +      unsigned depth_stall_enable:1; +      unsigned post_sync_operation:2; + +      unsigned opcode:16; +   } header; + +   struct +   { +      unsigned pad:2; +      unsigned dest_addr_type:1; +      unsigned dest_addr:29; +   } bits1; + +   unsigned data0; +   unsigned data1; +}; + + +struct brw_urb_fence +{ +   struct +   { +      unsigned length:8; +      unsigned vs_realloc:1; +      unsigned gs_realloc:1; +      unsigned clp_realloc:1; +      unsigned sf_realloc:1; +      unsigned vfe_realloc:1; +      unsigned cs_realloc:1; +      unsigned pad:2; +      unsigned opcode:16; +   } header; + +   struct +   { +      unsigned vs_fence:10; +      unsigned gs_fence:10; +      unsigned clp_fence:10; +      unsigned pad:2; +   } bits0; + +   struct +   { +      unsigned sf_fence:10; +      unsigned vf_fence:10; +      unsigned cs_fence:10; +      unsigned pad:2; +   } bits1; +}; + +struct brw_constant_buffer_state /* previously brw_command_streamer */ +{ +   struct header header; + +   struct +   { +      unsigned nr_urb_entries:3; +      unsigned pad:1; +      unsigned urb_entry_size:5; +      unsigned pad0:23; +   } bits0; +}; + +struct brw_constant_buffer +{ +   struct +   { +      unsigned length:8; +      unsigned valid:1; +      unsigned pad:7; +      unsigned opcode:16; +   } header; + +   struct +   { +      unsigned buffer_length:6; +      unsigned buffer_address:26; +   } bits0; +}; + +struct brw_state_base_address +{ +   struct header header; + +   struct +   { +      unsigned modify_enable:1; +      unsigned pad:4; +      unsigned general_state_address:27; +   } bits0; + +   struct +   { +      unsigned modify_enable:1; +      unsigned pad:4; +      unsigned surface_state_address:27; +   } bits1; + +   struct +   { +      unsigned modify_enable:1; +      unsigned pad:4; +      unsigned indirect_object_state_address:27; +   } bits2; + +   struct +   { +      unsigned modify_enable:1; +      unsigned pad:11; +      unsigned general_state_upper_bound:20; +   } bits3; + +   struct +   { +      unsigned modify_enable:1; +      unsigned pad:11; +      unsigned indirect_object_state_upper_bound:20; +   } bits4; +}; + +struct brw_state_prefetch +{ +   struct header 
header; + +   struct +   { +      unsigned prefetch_count:3; +      unsigned pad:3; +      unsigned prefetch_pointer:26; +   } bits0; +}; + +struct brw_system_instruction_pointer +{ +   struct header header; + +   struct +   { +      unsigned pad:4; +      unsigned system_instruction_pointer:28; +   } bits0; +}; + + + + +/* State structs for the various fixed function units: + */ + + +struct thread0 +{ +   unsigned pad0:1; +   unsigned grf_reg_count:3; +   unsigned pad1:2; +   unsigned kernel_start_pointer:26; +}; + +struct thread1 +{ +   unsigned ext_halt_exception_enable:1; +   unsigned sw_exception_enable:1; +   unsigned mask_stack_exception_enable:1; +   unsigned timeout_exception_enable:1; +   unsigned illegal_op_exception_enable:1; +   unsigned pad0:3; +   unsigned depth_coef_urb_read_offset:6;	/* WM only */ +   unsigned pad1:2; +   unsigned floating_point_mode:1; +   unsigned thread_priority:1; +   unsigned binding_table_entry_count:8; +   unsigned pad3:5; +   unsigned single_program_flow:1; +}; + +struct thread2 +{ +   unsigned per_thread_scratch_space:4; +   unsigned pad0:6; +   unsigned scratch_space_base_pointer:22; +}; + + +struct thread3 +{ +   unsigned dispatch_grf_start_reg:4; +   unsigned urb_entry_read_offset:6; +   unsigned pad0:1; +   unsigned urb_entry_read_length:6; +   unsigned pad1:1; +   unsigned const_urb_entry_read_offset:6; +   unsigned pad2:1; +   unsigned const_urb_entry_read_length:6; +   unsigned pad3:1; +}; + + + +struct brw_clip_unit_state +{ +   struct thread0 thread0; +   struct +   { +      unsigned pad0:7; +      unsigned sw_exception_enable:1; +      unsigned pad1:3; +      unsigned mask_stack_exception_enable:1; +      unsigned pad2:1; +      unsigned illegal_op_exception_enable:1; +      unsigned pad3:2; +      unsigned floating_point_mode:1; +      unsigned thread_priority:1; +      unsigned binding_table_entry_count:8; +      unsigned pad4:5; +      unsigned single_program_flow:1; +   } thread1; + +   struct thread2 thread2; +   struct thread3 thread3; + +   struct +   { +      unsigned pad0:9; +      unsigned gs_output_stats:1; /* not always */ +      unsigned stats_enable:1; +      unsigned nr_urb_entries:7; +      unsigned pad1:1; +      unsigned urb_entry_allocation_size:5; +      unsigned pad2:1; +      unsigned max_threads:1; 	/* may be less */ +      unsigned pad3:6; +   } thread4; + +   struct +   { +      unsigned pad0:13; +      unsigned clip_mode:3; +      unsigned userclip_enable_flags:8; +      unsigned userclip_must_clip:1; +      unsigned pad1:1; +      unsigned guard_band_enable:1; +      unsigned viewport_z_clip_enable:1; +      unsigned viewport_xy_clip_enable:1; +      unsigned vertex_position_space:1; +      unsigned api_mode:1; +      unsigned pad2:1; +   } clip5; + +   struct +   { +      unsigned pad0:5; +      unsigned clipper_viewport_state_ptr:27; +   } clip6; + + +   float viewport_xmin; +   float viewport_xmax; +   float viewport_ymin; +   float viewport_ymax; +}; + + + +struct brw_cc_unit_state +{ +   struct +   { +      unsigned pad0:3; +      unsigned bf_stencil_pass_depth_pass_op:3; +      unsigned bf_stencil_pass_depth_fail_op:3; +      unsigned bf_stencil_fail_op:3; +      unsigned bf_stencil_func:3; +      unsigned bf_stencil_enable:1; +      unsigned pad1:2; +      unsigned stencil_write_enable:1; +      unsigned stencil_pass_depth_pass_op:3; +      unsigned stencil_pass_depth_fail_op:3; +      unsigned stencil_fail_op:3; +      unsigned stencil_func:3; +      unsigned stencil_enable:1; +   } cc0; + + +   struct 
+   { +      unsigned bf_stencil_ref:8; +      unsigned stencil_write_mask:8; +      unsigned stencil_test_mask:8; +      unsigned stencil_ref:8; +   } cc1; + + +   struct +   { +      unsigned logicop_enable:1; +      unsigned pad0:10; +      unsigned depth_write_enable:1; +      unsigned depth_test_function:3; +      unsigned depth_test:1; +      unsigned bf_stencil_write_mask:8; +      unsigned bf_stencil_test_mask:8; +   } cc2; + + +   struct +   { +      unsigned pad0:8; +      unsigned alpha_test_func:3; +      unsigned alpha_test:1; +      unsigned blend_enable:1; +      unsigned ia_blend_enable:1; +      unsigned pad1:1; +      unsigned alpha_test_format:1; +      unsigned pad2:16; +   } cc3; + +   struct +   { +      unsigned pad0:5; +      unsigned cc_viewport_state_offset:27; +   } cc4; + +   struct +   { +      unsigned pad0:2; +      unsigned ia_dest_blend_factor:5; +      unsigned ia_src_blend_factor:5; +      unsigned ia_blend_function:3; +      unsigned statistics_enable:1; +      unsigned logicop_func:4; +      unsigned pad1:11; +      unsigned dither_enable:1; +   } cc5; + +   struct +   { +      unsigned clamp_post_alpha_blend:1; +      unsigned clamp_pre_alpha_blend:1; +      unsigned clamp_range:2; +      unsigned pad0:11; +      unsigned y_dither_offset:2; +      unsigned x_dither_offset:2; +      unsigned dest_blend_factor:5; +      unsigned src_blend_factor:5; +      unsigned blend_function:3; +   } cc6; + +   struct { +      union { +	 float f; +	 ubyte ub[4]; +      } alpha_ref; +   } cc7; +}; + + + +struct brw_sf_unit_state +{ +   struct thread0 thread0; +   struct thread1 thread1; +   struct thread2 thread2; +   struct thread3 thread3; + +   struct +   { +      unsigned pad0:10; +      unsigned stats_enable:1; +      unsigned nr_urb_entries:7; +      unsigned pad1:1; +      unsigned urb_entry_allocation_size:5; +      unsigned pad2:1; +      unsigned max_threads:6; +      unsigned pad3:1; +   } thread4; + +   struct +   { +      unsigned front_winding:1; +      unsigned viewport_transform:1; +      unsigned pad0:3; +      unsigned sf_viewport_state_offset:27; +   } sf5; + +   struct +   { +      unsigned pad0:9; +      unsigned dest_org_vbias:4; +      unsigned dest_org_hbias:4; +      unsigned scissor:1; +      unsigned disable_2x2_trifilter:1; +      unsigned disable_zero_pix_trifilter:1; +      unsigned point_rast_rule:2; +      unsigned line_endcap_aa_region_width:2; +      unsigned line_width:4; +      unsigned fast_scissor_disable:1; +      unsigned cull_mode:2; +      unsigned aa_enable:1; +   } sf6; + +   struct +   { +      unsigned point_size:11; +      unsigned use_point_size_state:1; +      unsigned subpixel_precision:1; +      unsigned sprite_point:1; +      unsigned pad0:11; +      unsigned trifan_pv:2; +      unsigned linestrip_pv:2; +      unsigned tristrip_pv:2; +      unsigned line_last_pixel_enable:1; +   } sf7; + +}; + + +struct brw_gs_unit_state +{ +   struct thread0 thread0; +   struct thread1 thread1; +   struct thread2 thread2; +   struct thread3 thread3; + +   struct +   { +      unsigned pad0:10; +      unsigned stats_enable:1; +      unsigned nr_urb_entries:7; +      unsigned pad1:1; +      unsigned urb_entry_allocation_size:5; +      unsigned pad2:1; +      unsigned max_threads:1; +      unsigned pad3:6; +   } thread4; + +   struct +   { +      unsigned sampler_count:3; +      unsigned pad0:2; +      unsigned sampler_state_pointer:27; +   } gs5; + + +   struct +   { +      unsigned max_vp_index:4; +      unsigned pad0:26; +      
unsigned reorder_enable:1; +      unsigned pad1:1; +   } gs6; +}; + + +struct brw_vs_unit_state +{ +   struct thread0 thread0; +   struct thread1 thread1; +   struct thread2 thread2; +   struct thread3 thread3; + +   struct +   { +      unsigned pad0:10; +      unsigned stats_enable:1; +      unsigned nr_urb_entries:7; +      unsigned pad1:1; +      unsigned urb_entry_allocation_size:5; +      unsigned pad2:1; +      unsigned max_threads:4; +      unsigned pad3:3; +   } thread4; + +   struct +   { +      unsigned sampler_count:3; +      unsigned pad0:2; +      unsigned sampler_state_pointer:27; +   } vs5; + +   struct +   { +      unsigned vs_enable:1; +      unsigned vert_cache_disable:1; +      unsigned pad0:30; +   } vs6; +}; + + +struct brw_wm_unit_state +{ +   struct thread0 thread0; +   struct thread1 thread1; +   struct thread2 thread2; +   struct thread3 thread3; + +   struct { +      unsigned stats_enable:1; +      unsigned pad0:1; +      unsigned sampler_count:3; +      unsigned sampler_state_pointer:27; +   } wm4; + +   struct +   { +      unsigned enable_8_pix:1; +      unsigned enable_16_pix:1; +      unsigned enable_32_pix:1; +      unsigned pad0:7; +      unsigned legacy_global_depth_bias:1; +      unsigned line_stipple:1; +      unsigned depth_offset:1; +      unsigned polygon_stipple:1; +      unsigned line_aa_region_width:2; +      unsigned line_endcap_aa_region_width:2; +      unsigned early_depth_test:1; +      unsigned thread_dispatch_enable:1; +      unsigned program_uses_depth:1; +      unsigned program_computes_depth:1; +      unsigned program_uses_killpixel:1; +      unsigned legacy_line_rast: 1; +      unsigned pad1:1; +      unsigned max_threads:6; +      unsigned pad2:1; +   } wm5; + +   float global_depth_offset_constant; +   float global_depth_offset_scale; +}; + +struct brw_sampler_default_color { +   float color[4]; +}; + +struct brw_sampler_state +{ + +   struct +   { +      unsigned shadow_function:3; +      unsigned lod_bias:11; +      unsigned min_filter:3; +      unsigned mag_filter:3; +      unsigned mip_filter:2; +      unsigned base_level:5; +      unsigned pad:1; +      unsigned lod_preclamp:1; +      unsigned default_color_mode:1; +      unsigned pad0:1; +      unsigned disable:1; +   } ss0; + +   struct +   { +      unsigned r_wrap_mode:3; +      unsigned t_wrap_mode:3; +      unsigned s_wrap_mode:3; +      unsigned pad:3; +      unsigned max_lod:10; +      unsigned min_lod:10; +   } ss1; + + +   struct +   { +      unsigned pad:5; +      unsigned default_color_pointer:27; +   } ss2; + +   struct +   { +      unsigned pad:19; +      unsigned max_aniso:3; +      unsigned chroma_key_mode:1; +      unsigned chroma_key_index:2; +      unsigned chroma_key_enable:1; +      unsigned monochrome_filter_width:3; +      unsigned monochrome_filter_height:3; +   } ss3; +}; + + +struct brw_clipper_viewport +{ +   float xmin; +   float xmax; +   float ymin; +   float ymax; +}; + +struct brw_cc_viewport +{ +   float min_depth; +   float max_depth; +}; + +struct brw_sf_viewport +{ +   struct { +      float m00; +      float m11; +      float m22; +      float m30; +      float m31; +      float m32; +   } viewport; + +   struct { +      short xmin; +      short ymin; +      short xmax; +      short ymax; +   } scissor; +}; + +/* Documented in the subsystem/shared-functions/sampler chapter... 
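+ *
+ * The five dwords below (ss0..ss4) describe a single surface: ss0 holds
+ * the surface type, format, cube-face enables and per-channel write
+ * disables; ss1 the base address; ss2 the width, height and mip count;
+ * ss3 the pitch, tiling and depth; ss4 the minimum LOD and array
+ * element.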
+ */ +struct brw_surface_state +{ +   struct { +      unsigned cube_pos_z:1; +      unsigned cube_neg_z:1; +      unsigned cube_pos_y:1; +      unsigned cube_neg_y:1; +      unsigned cube_pos_x:1; +      unsigned cube_neg_x:1; +      unsigned pad:4; +      unsigned mipmap_layout_mode:1; +      unsigned vert_line_stride_ofs:1; +      unsigned vert_line_stride:1; +      unsigned color_blend:1; +      unsigned writedisable_blue:1; +      unsigned writedisable_green:1; +      unsigned writedisable_red:1; +      unsigned writedisable_alpha:1; +      unsigned surface_format:9; +      unsigned data_return_format:1; +      unsigned pad0:1; +      unsigned surface_type:3; +   } ss0; + +   struct { +      unsigned base_addr; +   } ss1; + +   struct { +      unsigned pad:2; +      unsigned mip_count:4; +      unsigned width:13; +      unsigned height:13; +   } ss2; + +   struct { +      unsigned tile_walk:1; +      unsigned tiled_surface:1; +      unsigned pad:1; +      unsigned pitch:18; +      unsigned depth:11; +   } ss3; + +   struct { +      unsigned pad:19; +      unsigned min_array_elt:9; +      unsigned min_lod:4; +   } ss4; +}; + + + +struct brw_vertex_buffer_state +{ +   struct { +      unsigned pitch:11; +      unsigned pad:15; +      unsigned access_type:1; +      unsigned vb_index:5; +   } vb0; + +   unsigned start_addr; +   unsigned max_index; +#if 1 +   unsigned instance_data_step_rate; /* not included for sequential/random vertices? */ +#endif +}; + +#define BRW_VBP_MAX 17 + +struct brw_vb_array_state { +   struct header header; +   struct brw_vertex_buffer_state vb[BRW_VBP_MAX]; +}; + + +struct brw_vertex_element_state +{ +   struct +   { +      unsigned src_offset:11; +      unsigned pad:5; +      unsigned src_format:9; +      unsigned pad0:1; +      unsigned valid:1; +      unsigned vertex_buffer_index:5; +   } ve0; + +   struct +   { +      unsigned dst_offset:8; +      unsigned pad:8; +      unsigned vfcomponent3:4; +      unsigned vfcomponent2:4; +      unsigned vfcomponent1:4; +      unsigned vfcomponent0:4; +   } ve1; +}; + +#define BRW_VEP_MAX 18 + +struct brw_vertex_element_packet { +   struct header header; +   struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */ +}; + + +struct brw_urb_immediate { +   unsigned opcode:4; +   unsigned offset:6; +   unsigned swizzle_control:2; +   unsigned pad:1; +   unsigned allocate:1; +   unsigned used:1; +   unsigned complete:1; +   unsigned response_length:4; +   unsigned msg_length:4; +   unsigned msg_target:4; +   unsigned pad1:3; +   unsigned end_of_thread:1; +}; + +/* Instruction format for the execution units: + */ + +struct brw_instruction +{ +   struct +   { +      unsigned opcode:7; +      unsigned pad:1; +      unsigned access_mode:1; +      unsigned mask_control:1; +      unsigned dependency_control:2; +      unsigned compression_control:2; +      unsigned thread_control:2; +      unsigned predicate_control:4; +      unsigned predicate_inverse:1; +      unsigned execution_size:3; +      unsigned destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */ +      unsigned pad0:2; +      unsigned debug_control:1; +      unsigned saturate:1; +   } header; + +   union { +      struct +      { +	 unsigned dest_reg_file:2; +	 unsigned dest_reg_type:3; +	 unsigned src0_reg_file:2; +	 unsigned src0_reg_type:3; +	 unsigned src1_reg_file:2; +	 unsigned src1_reg_type:3; +	 unsigned pad:1; +	 unsigned dest_subreg_nr:5; +	 unsigned dest_reg_nr:8; +	 unsigned dest_horiz_stride:2; +	 unsigned 
dest_address_mode:1; +      } da1; + +      struct +      { +	 unsigned dest_reg_file:2; +	 unsigned dest_reg_type:3; +	 unsigned src0_reg_file:2; +	 unsigned src0_reg_type:3; +	 unsigned pad:6; +	 int dest_indirect_offset:10;	/* offset against the deref'd address reg */ +	 unsigned dest_subreg_nr:3; /* subnr for the address reg a0.x */ +	 unsigned dest_horiz_stride:2; +	 unsigned dest_address_mode:1; +      } ia1; + +      struct +      { +	 unsigned dest_reg_file:2; +	 unsigned dest_reg_type:3; +	 unsigned src0_reg_file:2; +	 unsigned src0_reg_type:3; +	 unsigned src1_reg_file:2; +	 unsigned src1_reg_type:3; +	 unsigned pad0:1; +	 unsigned dest_writemask:4; +	 unsigned dest_subreg_nr:1; +	 unsigned dest_reg_nr:8; +	 unsigned pad1:2; +	 unsigned dest_address_mode:1; +      } da16; + +      struct +      { +	 unsigned dest_reg_file:2; +	 unsigned dest_reg_type:3; +	 unsigned src0_reg_file:2; +	 unsigned src0_reg_type:3; +	 unsigned pad0:6; +	 unsigned dest_writemask:4; +	 int dest_indirect_offset:6; +	 unsigned dest_subreg_nr:3; +	 unsigned pad1:2; +	 unsigned dest_address_mode:1; +      } ia16; +   } bits1; + + +   union { +      struct +      { +	 unsigned src0_subreg_nr:5; +	 unsigned src0_reg_nr:8; +	 unsigned src0_abs:1; +	 unsigned src0_negate:1; +	 unsigned src0_address_mode:1; +	 unsigned src0_horiz_stride:2; +	 unsigned src0_width:3; +	 unsigned src0_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad:6; +      } da1; + +      struct +      { +	 int src0_indirect_offset:10; +	 unsigned src0_subreg_nr:3; +	 unsigned src0_abs:1; +	 unsigned src0_negate:1; +	 unsigned src0_address_mode:1; +	 unsigned src0_horiz_stride:2; +	 unsigned src0_width:3; +	 unsigned src0_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad:6; +      } ia1; + +      struct +      { +	 unsigned src0_swz_x:2; +	 unsigned src0_swz_y:2; +	 unsigned src0_subreg_nr:1; +	 unsigned src0_reg_nr:8; +	 unsigned src0_abs:1; +	 unsigned src0_negate:1; +	 unsigned src0_address_mode:1; +	 unsigned src0_swz_z:2; +	 unsigned src0_swz_w:2; +	 unsigned pad0:1; +	 unsigned src0_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad1:6; +      } da16; + +      struct +      { +	 unsigned src0_swz_x:2; +	 unsigned src0_swz_y:2; +	 int src0_indirect_offset:6; +	 unsigned src0_subreg_nr:3; +	 unsigned src0_abs:1; +	 unsigned src0_negate:1; +	 unsigned src0_address_mode:1; +	 unsigned src0_swz_z:2; +	 unsigned src0_swz_w:2; +	 unsigned pad0:1; +	 unsigned src0_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad1:6; +      } ia16; + +   } bits2; + +   union +   { +      struct +      { +	 unsigned src1_subreg_nr:5; +	 unsigned src1_reg_nr:8; +	 unsigned src1_abs:1; +	 unsigned src1_negate:1; +	 unsigned pad:1; +	 unsigned src1_horiz_stride:2; +	 unsigned src1_width:3; +	 unsigned src1_vert_stride:4; +	 unsigned pad0:7; +      } da1; + +      struct +      { +	 unsigned src1_swz_x:2; +	 unsigned src1_swz_y:2; +	 unsigned src1_subreg_nr:1; +	 unsigned src1_reg_nr:8; +	 unsigned src1_abs:1; +	 unsigned src1_negate:1; +	 unsigned pad0:1; +	 unsigned src1_swz_z:2; +	 unsigned src1_swz_w:2; +	 unsigned pad1:1; +	 unsigned src1_vert_stride:4; +	 unsigned pad2:7; +      } da16; + +      struct +      { +	 int  src1_indirect_offset:10; +	 unsigned src1_subreg_nr:3; +	 unsigned src1_abs:1; +	 unsigned src1_negate:1; +	 unsigned pad0:1; +	 unsigned src1_horiz_stride:2; +	 unsigned src1_width:3; +	 unsigned src1_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad1:6; +      } ia1; + +      struct +      { +	 unsigned 
src1_swz_x:2; +	 unsigned src1_swz_y:2; +	 int  src1_indirect_offset:6; +	 unsigned src1_subreg_nr:3; +	 unsigned src1_abs:1; +	 unsigned src1_negate:1; +	 unsigned pad0:1; +	 unsigned src1_swz_z:2; +	 unsigned src1_swz_w:2; +	 unsigned pad1:1; +	 unsigned src1_vert_stride:4; +	 unsigned flag_reg_nr:1; +	 unsigned pad2:6; +      } ia16; + + +      struct +      { +	 int  jump_count:16;	/* note: signed */ +	 unsigned  pop_count:4; +	 unsigned  pad0:12; +      } if_else; + +      struct { +	 unsigned function:4; +	 unsigned int_type:1; +	 unsigned precision:1; +	 unsigned saturate:1; +	 unsigned data_type:1; +	 unsigned pad0:8; +	 unsigned response_length:4; +	 unsigned msg_length:4; +	 unsigned msg_target:4; +	 unsigned pad1:3; +	 unsigned end_of_thread:1; +      } math; + +      struct { +	 unsigned binding_table_index:8; +	 unsigned sampler:4; +	 unsigned return_format:2; +	 unsigned msg_type:2; +	 unsigned response_length:4; +	 unsigned msg_length:4; +	 unsigned msg_target:4; +	 unsigned pad1:3; +	 unsigned end_of_thread:1; +      } sampler; + +      struct brw_urb_immediate urb; + +      struct { +	 unsigned binding_table_index:8; +	 unsigned msg_control:4; +	 unsigned msg_type:2; +	 unsigned target_cache:2; +	 unsigned response_length:4; +	 unsigned msg_length:4; +	 unsigned msg_target:4; +	 unsigned pad1:3; +	 unsigned end_of_thread:1; +      } dp_read; + +      struct { +	 unsigned binding_table_index:8; +	 unsigned msg_control:3; +	 unsigned pixel_scoreboard_clear:1; +	 unsigned msg_type:3; +	 unsigned send_commit_msg:1; +	 unsigned response_length:4; +	 unsigned msg_length:4; +	 unsigned msg_target:4; +	 unsigned pad1:3; +	 unsigned end_of_thread:1; +      } dp_write; + +      struct { +	 unsigned pad:16; +	 unsigned response_length:4; +	 unsigned msg_length:4; +	 unsigned msg_target:4; +	 unsigned pad1:3; +	 unsigned end_of_thread:1; +      } generic; + +      int d; +      unsigned ud; +   } bits3; +}; + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c new file mode 100644 index 0000000000..0a95dce194 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_surface.c @@ -0,0 +1,127 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#include "brw_blit.h" +#include "brw_context.h" +#include "brw_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_tile.h" +#include "util/u_rect.h" + + + +/* Assumes all values are within bounds -- no checking at this level - + * do it higher up if required. + */ +static void +brw_surface_copy(struct pipe_context *pipe, +                 boolean do_flip, +                 struct pipe_surface *dst, +                 unsigned dstx, unsigned dsty, +                 struct pipe_surface *src, +                 unsigned srcx, unsigned srcy, unsigned width, unsigned height) +{ +   assert( dst != src ); +   assert( dst->block.size == src->block.size ); +   assert( dst->block.width == src->block.height ); +   assert( dst->block.height == src->block.height ); + +   if (0) { +      void *dst_map = pipe->screen->surface_map( pipe->screen, +                                                 dst, +                                                 PIPE_BUFFER_USAGE_CPU_WRITE ); +       +      const void *src_map = pipe->screen->surface_map( pipe->screen, +                                                       src, +                                                       PIPE_BUFFER_USAGE_CPU_READ ); +       +      pipe_copy_rect(dst_map, +                     &dst->block, +                     dst->stride, +                     dstx, dsty,  +                     width, height,  +                     src_map,  +                     do_flip ? -(int) src->stride : src->stride,  +                     srcx, do_flip ? height - 1 - srcy : srcy); + +      pipe->screen->surface_unmap(pipe->screen, src); +      pipe->screen->surface_unmap(pipe->screen, dst); +   } +   else { +      struct brw_texture *dst_tex = (struct brw_texture *)dst->texture; +      struct brw_texture *src_tex = (struct brw_texture *)src->texture; +      assert(dst->block.width == 1); +      assert(dst->block.height == 1); +      brw_copy_blit(brw_context(pipe), +                    do_flip, +                    dst->block.size, +                    (short) src->stride/src->block.size, src_tex->buffer, src->offset, FALSE, +                    (short) dst->stride/dst->block.size, dst_tex->buffer, dst->offset, FALSE, +                    (short) srcx, (short) srcy, (short) dstx, (short) dsty, +                    (short) width, (short) height, PIPE_LOGICOP_COPY); +   } +} + + +static void +brw_surface_fill(struct pipe_context *pipe, +                 struct pipe_surface *dst, +                 unsigned dstx, unsigned dsty, +                 unsigned width, unsigned height, unsigned value) +{ +   if (0) { +      void *dst_map = pipe->screen->surface_map( pipe->screen, +                                                 dst, +                                                 PIPE_BUFFER_USAGE_CPU_WRITE ); + +      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value); + +      pipe->screen->surface_unmap(pipe->screen, dst); +   } +   else { +      struct brw_texture *tex = (struct brw_texture *)dst->texture; +      assert(dst->block.width == 1); +      assert(dst->block.height == 1); +      brw_fill_blit(brw_context(pipe), +                    dst->block.size, +                    (short) dst->stride/dst->block.size,  +                    tex->buffer, dst->offset, FALSE, +                    (short) dstx, (short) dsty, +                    (short) width, (short) 
height, +                    value); +   } +} + + +void +brw_init_surface_functions(struct brw_context *brw) +{ +   brw->pipe.surface_copy  = brw_surface_copy; +   brw->pipe.surface_fill  = brw_surface_fill; +} diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c new file mode 100644 index 0000000000..448229ed4e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_tex_layout.c @@ -0,0 +1,398 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +/* Code to layout images in a mipmap tree for i965. 
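+ *
+ * 2D (and 1D) targets use the i945-style layout below: level 0 at the
+ * top left, level 1 directly beneath it, and levels 2 and smaller in a
+ * column to the right of level 1.  Cube and 3D targets instead pack
+ * all images of a level (faces or depth slices) in rows before
+ * stepping down to the next level.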
+ */ + +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "brw_context.h" +#include "brw_tex_layout.h" + + +#define FILE_DEBUG_FLAG DEBUG_TEXTURE + +#if 0 +unsigned intel_compressed_alignment(unsigned internalFormat) +{ +    unsigned alignment = 4; + +    switch (internalFormat) { +    case GL_COMPRESSED_RGB_FXT1_3DFX: +    case GL_COMPRESSED_RGBA_FXT1_3DFX: +        alignment = 8; +        break; + +    default: +        break; +    } + +    return alignment; +} +#endif + +static unsigned minify( unsigned d ) +{ +   return MAX2(1, d>>1); +} + + +static void intel_miptree_set_image_offset(struct brw_texture *tex, +                                           unsigned level, +                                           unsigned img, +                                           unsigned x, unsigned y) +{ +   struct pipe_texture *pt = &tex->base; +   if (img == 0 && level == 0) +      assert(x == 0 && y == 0); +   assert(img < tex->nr_images[level]); + +   tex->image_offset[level][img] = y * tex->stride + x * pt->block.size; +} + +static void intel_miptree_set_level_info(struct brw_texture *tex, +                                         unsigned level, +                                         unsigned nr_images, +                                         unsigned x, unsigned y, +                                         unsigned w, unsigned h, unsigned d) +{ +   struct pipe_texture *pt = &tex->base; + +   assert(level < PIPE_MAX_TEXTURE_LEVELS); + +   pt->width[level] = w; +   pt->height[level] = h; +   pt->depth[level] = d; +    +   pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w); +   pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h); + +   tex->level_offset[level] = y * tex->stride + x * tex->base.block.size; +   tex->nr_images[level] = nr_images; + +   /* +   DBG("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, +       level, w, h, d, x, y, tex->level_offset[level]); +   */ + +   /* Not sure when this would happen, but anyway:  +    */ +   if (tex->image_offset[level]) { +      FREE(tex->image_offset[level]); +      tex->image_offset[level] = NULL; +   } + +   assert(nr_images); +   assert(!tex->image_offset[level]); + +   tex->image_offset[level] = (unsigned *) MALLOC(nr_images * sizeof(unsigned)); +   tex->image_offset[level][0] = 0; +} + +static void i945_miptree_layout_2d(struct brw_texture *tex) +{ +   struct pipe_texture *pt = &tex->base; +   const int align_x = 2, align_y = 4; +   unsigned level; +   unsigned x = 0; +   unsigned y = 0; +   unsigned width = pt->width[0]; +   unsigned height = pt->height[0]; +   unsigned nblocksx = pt->nblocksx[0]; +   unsigned nblocksy = pt->nblocksy[0]; + +   tex->stride = align(pt->nblocksx[0] * pt->block.size, 4); + +   /* May need to adjust pitch to accomodate the placement of +    * the 2nd mipmap level.  This occurs when the alignment +    * constraints of mipmap placement push the right edge of the +    * 2nd mipmap level out past the width of its parent. 
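+    * As a hypothetical example, for a 2-texel wide, uncompressed,
+    * mipmapped 2D texture: level 0 needs only 2 blocks of pitch, but
+    * align(1, align_x) + 1 = 3 blocks are needed to place levels 1
+    * and 2 side by side, so the stride is widened accordingly.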
+    */ +   if (pt->last_level > 0) { +      unsigned mip1_nblocksx  +	 = align(pf_get_nblocksx(&pt->block, minify(width)), align_x) +         + pf_get_nblocksx(&pt->block, minify(minify(width))); + +      if (mip1_nblocksx > nblocksx) +	 tex->stride = mip1_nblocksx * pt->block.size; +   } + +   /* Pitch must be a whole number of dwords +    */ +   tex->stride = align(tex->stride, 64); +   tex->total_nblocksy = 0; + +   for (level = 0; level <= pt->last_level; level++) { +      intel_miptree_set_level_info(tex, level, 1, x, y, width, +				   height, 1); + +      nblocksy = align(nblocksy, align_y); + +      /* Because the images are packed better, the final offset +       * might not be the maximal one: +       */ +      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy); + +      /* Layout_below: step right after second mipmap level. +       */ +      if (level == 1) { +	 x += align(nblocksx, align_x); +      } +      else { +	 y += nblocksy; +      } + +      width  = minify(width); +      height = minify(height); +      nblocksx = pf_get_nblocksx(&pt->block, width); +      nblocksy = pf_get_nblocksy(&pt->block, height); +   } +} + +static boolean brw_miptree_layout(struct brw_texture *tex) +{ +   struct pipe_texture *pt = &tex->base; +   /* XXX: these vary depending on image format: +    */ +/*    int align_w = 4; */ + +   switch (pt->target) { +   case PIPE_TEXTURE_CUBE: +   case PIPE_TEXTURE_3D: { +      unsigned width  = pt->width[0]; +      unsigned height = pt->height[0]; +      unsigned depth = pt->depth[0]; +      unsigned nblocksx = pt->nblocksx[0]; +      unsigned nblocksy = pt->nblocksy[0]; +      unsigned pack_x_pitch, pack_x_nr; +      unsigned pack_y_pitch; +      unsigned level; +      unsigned align_h = 2; +      unsigned align_w = 4; + +      tex->total_nblocksy = 0; + +      tex->stride = align(pt->nblocksx[0], 4); +      pack_y_pitch = align(pt->nblocksy[0], align_h); + +      pack_x_pitch = tex->stride / pt->block.size; +      pack_x_nr = 1; + +      for (level = 0; level <= pt->last_level; level++) { +	 unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? 
depth : 6; +	 int x = 0; +	 int y = 0; +	 uint q, j; + +	 intel_miptree_set_level_info(tex, level, nr_images, +				      0, tex->total_nblocksy, +				      width, height, depth); + +	 for (q = 0; q < nr_images;) { +	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) { +	       intel_miptree_set_image_offset(tex, level, q, x, y); +	       x += pack_x_pitch; +	    } + +	    x = 0; +	    y += pack_y_pitch; +	 } + + +	 tex->total_nblocksy += y; +	 width  = minify(width); +	 height = minify(height); +	 depth  = minify(depth); +         nblocksx = pf_get_nblocksx(&pt->block, width); +         nblocksy = pf_get_nblocksy(&pt->block, height); + +         if (pt->compressed) { +            pack_y_pitch = (height + 3) / 4; + +            if (pack_x_pitch > align(width, align_w)) { +               pack_x_pitch = align(width, align_w); +               pack_x_nr <<= 1; +            } +         } else { +            if (pack_x_pitch > 4) { +               pack_x_pitch >>= 1; +               pack_x_nr <<= 1; +               assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride); +            } + +            if (pack_y_pitch > 2) { +               pack_y_pitch >>= 1; +               pack_y_pitch = align(pack_y_pitch, align_h); +            } +         } + +      } +      break; +   } + +   default: +      i945_miptree_layout_2d(tex); +      break; +   } +#if 0 +   PRINT("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, +       pt->pitch, +       pt->total_nblocksy, +       pt->block.size, +       pt->stride * pt->total_nblocksy ); +#endif + +   return TRUE; +} + + +static struct pipe_texture * +brw_texture_create_screen(struct pipe_screen *screen, +                          const struct pipe_texture *templat) +{ +   struct pipe_winsys *ws = screen->winsys; +   struct brw_texture *tex = CALLOC_STRUCT(brw_texture); + +   if (tex) { +      tex->base = *templat; +      tex->base.refcount = 1; + +      tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]); +      tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]); +    +      if (brw_miptree_layout(tex)) +	 tex->buffer = ws->buffer_create(ws, 64, +                                          PIPE_BUFFER_USAGE_PIXEL, +                                          tex->stride * +                                          tex->total_nblocksy); + +      if (!tex->buffer) { +	 FREE(tex); +         return NULL; +      } +   } + +   return &tex->base; +} + + +static void +brw_texture_release_screen(struct pipe_screen *screen, +                           struct pipe_texture **pt) +{ +   if (!*pt) +      return; + +   /* +   DBG("%s %p refcount will be %d\n", +       __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); +   */ +   if (--(*pt)->refcount <= 0) { +      struct brw_texture *tex = (struct brw_texture *)*pt; +      uint i; + +      /* +      DBG("%s deleting %p\n", __FUNCTION__, (void *) tex); +      */ + +      pipe_buffer_reference(screen, &tex->buffer, NULL); + +      for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) +         if (tex->image_offset[i]) +            free(tex->image_offset[i]); + +      free(tex); +   } +   *pt = NULL; +} + + +static struct pipe_surface * +brw_get_tex_surface_screen(struct pipe_screen *screen, +                           struct pipe_texture *pt, +                           unsigned face, unsigned level, unsigned zslice) +{ +   struct brw_texture *tex = (struct brw_texture *)pt; +   struct pipe_surface *ps; +   unsigned offset;  /* in bytes */ + +   offset = 
tex->level_offset[level]; + +   if (pt->target == PIPE_TEXTURE_CUBE) { +      offset += tex->image_offset[level][face]; +   } +   else if (pt->target == PIPE_TEXTURE_3D) { +      offset += tex->image_offset[level][zslice]; +   } +   else { +      assert(face == 0); +      assert(zslice == 0); +   } + +   ps = CALLOC_STRUCT(pipe_surface); +   if (ps) { +      ps->refcount = 1; +      pipe_texture_reference(&ps->texture, pt); +      ps->format = pt->format; +      ps->width = pt->width[level]; +      ps->height = pt->height[level]; +      ps->block = pt->block; +      ps->nblocksx = pt->nblocksx[level]; +      ps->nblocksy = pt->nblocksy[level]; +      ps->stride = tex->stride; +      ps->offset = offset; +      ps->status = PIPE_SURFACE_STATUS_DEFINED; +   } +   return ps; +} + + +void +brw_init_texture_functions(struct brw_context *brw) +{ +//   brw->pipe.texture_update = brw_texture_update; +} + + +void +brw_init_screen_texture_funcs(struct pipe_screen *screen) +{ +   screen->texture_create  = brw_texture_create_screen; +   screen->texture_release = brw_texture_release_screen; +   screen->get_tex_surface = brw_get_tex_surface_screen; +} + diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.h b/src/gallium/drivers/i965simple/brw_tex_layout.h new file mode 100644 index 0000000000..a6b6ba8146 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_tex_layout.h @@ -0,0 +1,44 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + + +#ifndef BRW_TEX_LAYOUT_H +#define BRW_TEX_LAYOUT_H + + +struct brw_context; +struct pipe_screen; + + +extern void +brw_init_texture_functions(struct brw_context *brw); + +extern void +brw_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_urb.c b/src/gallium/drivers/i965simple/brw_urb.c new file mode 100644 index 0000000000..101a4367b9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_urb.c @@ -0,0 +1,186 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +//#include "brw_state.h" +#include "brw_batch.h" +#include "brw_defines.h" + +#define VS 0 +#define GS 1 +#define CLP 2 +#define SF 3 +#define CS 4 + +/* XXX: Are the min_entry_size numbers useful? + * XXX: Verify min_nr_entries, esp for VS. + * XXX: Verify SF min_entry_size. + */ +static const struct { +   unsigned min_nr_entries; +   unsigned preferred_nr_entries; +   unsigned min_entry_size; +   unsigned max_entry_size; +} limits[CS+1] = { +   { 8, 32, 1, 5 },			/* vs */ +   { 4, 8,  1, 5 },			/* gs */ +   { 6, 8,  1, 5 },			/* clp */ +   { 1, 8,  1, 12 },		        /* sf */ +   { 1, 4,  1, 32 }			/* cs */ +}; + + +static boolean check_urb_layout( struct brw_context *brw ) +{ +   brw->urb.vs_start = 0; +   brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize; +   brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize; +   brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize; +   brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize; + +   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= 256; +} + +/* Most minimal update, forces re-emit of URB fence packet after GS + * unit turned on/off. 
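+ *
+ * With the worst-case entry sizes from the limits[] table above
+ * (vsize = 5, sfsize = 12, csize = 32), the preferred entry counts
+ * would need 32*5 + 8*5 + 8*5 + 8*12 + 4*32 = 464 of the 256
+ * available URB register pairs, so check_urb_layout() fails and the
+ * code falls back to the minimum counts, which need at most
+ * 8*5 + 4*5 + 6*5 + 1*12 + 1*32 = 134 and therefore always fit.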
+ */ +static void recalculate_urb_fence( struct brw_context *brw ) +{ +   unsigned csize = brw->curbe.total_size; +   unsigned vsize = brw->vs.prog_data->urb_entry_size; +   unsigned sfsize = brw->sf.prog_data->urb_entry_size; + +   if (csize < limits[CS].min_entry_size) +      csize = limits[CS].min_entry_size; + +   if (vsize < limits[VS].min_entry_size) +      vsize = limits[VS].min_entry_size; + +   if (sfsize < limits[SF].min_entry_size) +      sfsize = limits[SF].min_entry_size; + +   if (brw->urb.vsize < vsize || +       brw->urb.sfsize < sfsize || +       brw->urb.csize < csize || +       (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize || +				 brw->urb.sfsize > brw->urb.sfsize || +				 brw->urb.csize > brw->urb.csize))) { + + +      brw->urb.csize = csize; +      brw->urb.sfsize = sfsize; +      brw->urb.vsize = vsize; + +      brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; +      brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries; +      brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries; +      brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries; +      brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries; + +      if (!check_urb_layout(brw)) { +	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries; +	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries; +	 brw->urb.nr_clip_entries = limits[CLP].min_nr_entries; +	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries; +	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries; + +	 brw->urb.constrained = 1; + +	 if (!check_urb_layout(brw)) { +	    /* This is impossible, given the maximal sizes of urb +	     * entries and the values for minimum nr of entries +	     * provided above. +	     */ +	    debug_printf("couldn't calculate URB layout!\n"); +	    exit(1); +	 } + +	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS)) +	    debug_printf("URB CONSTRAINED\n"); +      } +      else +	 brw->urb.constrained = 0; + +      if (BRW_DEBUG & DEBUG_URB) +	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", +		      brw->urb.vs_start, +		      brw->urb.gs_start, +		      brw->urb.clip_start, +		      brw->urb.sf_start, +		      brw->urb.cs_start, +		      256); + +      brw->state.dirty.brw |= BRW_NEW_URB_FENCE; +   } +} + + +const struct brw_tracked_state brw_recalculate_urb_fence = { +   .dirty = { +      .brw = BRW_NEW_CURBE_OFFSETS, +      .cache = (CACHE_NEW_VS_PROG | +		CACHE_NEW_SF_PROG) +   }, +   .update = recalculate_urb_fence +}; + + + + + +void brw_upload_urb_fence(struct brw_context *brw) +{ +   struct brw_urb_fence uf; +   memset(&uf, 0, sizeof(uf)); + +   uf.header.opcode = CMD_URB_FENCE; +   uf.header.length = sizeof(uf)/4-2; +   uf.header.vs_realloc = 1; +   uf.header.gs_realloc = 1; +   uf.header.clp_realloc = 1; +   uf.header.sf_realloc = 1; +   uf.header.vfe_realloc = 1; +   uf.header.cs_realloc = 1; + +   /* The ordering below is correct, not the layout in the +    * instruction. +    * +    * There are 256 urb reg pairs in total. +    */ +   uf.bits0.vs_fence  = brw->urb.gs_start; +   uf.bits0.gs_fence  = brw->urb.clip_start; +   uf.bits0.clp_fence = brw->urb.sf_start; +   uf.bits1.sf_fence  = brw->urb.cs_start; +   uf.bits1.cs_fence  = 256; + +   BRW_BATCH_STRUCT(brw, &uf); +} diff --git a/src/gallium/drivers/i965simple/brw_util.c b/src/gallium/drivers/i965simple/brw_util.c new file mode 100644 index 0000000000..42391d7c8c --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_util.c @@ -0,0 +1,104 @@ +/* + Copyright (C) Intel Corp.  2006.  
All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_util.h" +#include "brw_defines.h" + +#include "pipe/p_defines.h" + +unsigned brw_count_bits( unsigned val ) +{ +   unsigned i; +   for (i = 0; val ; val >>= 1) +      if (val & 1) +	 i++; +   return i; +} + + +unsigned brw_translate_blend_equation( int mode ) +{ +   switch (mode) { +   case PIPE_BLEND_ADD: +      return BRW_BLENDFUNCTION_ADD; +   case PIPE_BLEND_MIN: +      return BRW_BLENDFUNCTION_MIN; +   case PIPE_BLEND_MAX: +      return BRW_BLENDFUNCTION_MAX; +   case PIPE_BLEND_SUBTRACT: +      return BRW_BLENDFUNCTION_SUBTRACT; +   case PIPE_BLEND_REVERSE_SUBTRACT: +      return BRW_BLENDFUNCTION_REVERSE_SUBTRACT; +   default: +      assert(0); +      return BRW_BLENDFUNCTION_ADD; +   } +} + +unsigned brw_translate_blend_factor( int factor ) +{ +   switch(factor) { +   case PIPE_BLENDFACTOR_ZERO: +      return BRW_BLENDFACTOR_ZERO; +   case PIPE_BLENDFACTOR_SRC_ALPHA: +      return BRW_BLENDFACTOR_SRC_ALPHA; +   case PIPE_BLENDFACTOR_ONE: +      return BRW_BLENDFACTOR_ONE; +   case PIPE_BLENDFACTOR_SRC_COLOR: +      return BRW_BLENDFACTOR_SRC_COLOR; +   case PIPE_BLENDFACTOR_INV_SRC_COLOR: +      return BRW_BLENDFACTOR_INV_SRC_COLOR; +   case PIPE_BLENDFACTOR_DST_COLOR: +      return BRW_BLENDFACTOR_DST_COLOR; +   case PIPE_BLENDFACTOR_INV_DST_COLOR: +      return BRW_BLENDFACTOR_INV_DST_COLOR; +   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +      return BRW_BLENDFACTOR_INV_SRC_ALPHA; +   case PIPE_BLENDFACTOR_DST_ALPHA: +      return BRW_BLENDFACTOR_DST_ALPHA; +   case PIPE_BLENDFACTOR_INV_DST_ALPHA: +      return BRW_BLENDFACTOR_INV_DST_ALPHA; +   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +      return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE; +   case PIPE_BLENDFACTOR_CONST_COLOR: +      return BRW_BLENDFACTOR_CONST_COLOR; +   case PIPE_BLENDFACTOR_INV_CONST_COLOR: +      return BRW_BLENDFACTOR_INV_CONST_COLOR; +   case PIPE_BLENDFACTOR_CONST_ALPHA: +      return BRW_BLENDFACTOR_CONST_ALPHA; +   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +      return BRW_BLENDFACTOR_INV_CONST_ALPHA; +   default: +      assert(0); +      return BRW_BLENDFACTOR_ZERO; +   } +} diff --git a/src/gallium/drivers/i965simple/brw_util.h 
b/src/gallium/drivers/i965simple/brw_util.h new file mode 100644 index 0000000000..d60e5934db --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_util.h @@ -0,0 +1,43 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +           + +#ifndef BRW_UTIL_H +#define BRW_UTIL_H + +#include "pipe/p_state.h" + +extern unsigned brw_count_bits( unsigned val ); +extern unsigned brw_translate_blend_factor( int factor ); +extern unsigned brw_translate_blend_equation( int mode ); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_vs.c b/src/gallium/drivers/i965simple/brw_vs.c new file mode 100644 index 0000000000..92327e896d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs.c @@ -0,0 +1,120 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_vs.h" +#include "brw_util.h" +#include "brw_state.h" + + +static void do_vs_prog( struct brw_context *brw, +			const struct brw_vertex_program *vp, +			struct brw_vs_prog_key *key ) +{ +   unsigned program_size; +   const unsigned *program; +   struct brw_vs_compile c; + +   memset(&c, 0, sizeof(c)); +   memcpy(&c.key, key, sizeof(*key)); + +   brw_init_compile(&c.func); +   c.vp = vp; + +   c.prog_data.outputs_written = vp->info.num_outputs; +   c.prog_data.inputs_read = vp->info.num_inputs; + +#if 0 +   if (c.key.copy_edgeflag) { +      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE; +      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG; +   } +#endif + +   /* Emit GEN4 code. +    */ +   brw_vs_emit(&c); + +   /* get the program +    */ +   program = brw_get_program(&c.func, &program_size); + +   /* +    */ +   brw->vs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_VS_PROG], +					      &c.key, +					      sizeof(c.key), +					      program, +					      program_size, +					      &c.prog_data, +					      &brw->vs.prog_data); +} + + +static void brw_upload_vs_prog( struct brw_context *brw ) +{ +   struct brw_vs_prog_key key; +   const struct brw_vertex_program *vp = brw->attribs.VertexProgram; + +   assert(vp); + +   memset(&key, 0, sizeof(key)); + +   /* Just upload the program verbatim for now.  Always send it all +    * the inputs it asks for, whether they are varying or not. +    */ +   key.program_string_id = vp->id; +   key.nr_userclip = brw->attribs.Clip.nr; +   key.copy_edgeflag = (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL || +			brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL); + +   /* Make an early check for the key. +    */ +   if (brw_search_cache(&brw->cache[BRW_VS_PROG], +			&key, sizeof(key), +			&brw->vs.prog_data, +			&brw->vs.prog_gs_offset)) +       return; + +   do_vs_prog(brw, vp, &key); +} + + +/* See brw_vs.c: + */ +const struct brw_tracked_state brw_vs_prog = { +   .dirty = { +      .brw   = BRW_NEW_VS, +      .cache = 0 +   }, +   .update = brw_upload_vs_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_vs.h b/src/gallium/drivers/i965simple/brw_vs.h new file mode 100644 index 0000000000..070f9dfcae --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs.h @@ -0,0 +1,82 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#ifndef BRW_VS_H +#define BRW_VS_H + + +#include "brw_context.h" +#include "brw_eu.h" + + +struct brw_vs_prog_key { +   unsigned program_string_id; +   unsigned nr_userclip:4; +   unsigned copy_edgeflag:1; +   unsigned know_w_is_one:1; +   unsigned pad:26; +}; + + +struct brw_vs_compile { +   struct brw_compile func; +   struct brw_vs_prog_key key; +   struct brw_vs_prog_data prog_data; + +   const struct brw_vertex_program *vp; + +   unsigned nr_inputs; + +   unsigned first_output; +   unsigned nr_outputs; + +   unsigned first_tmp; +   unsigned last_tmp; + +   struct brw_reg r0; +   struct brw_reg r1; +   struct brw_reg regs[12][128]; +   struct brw_reg tmp; +   struct brw_reg stack; + +   struct { +       boolean used_in_src; +       struct brw_reg reg; +   } output_regs[128]; + +   struct brw_reg userplane[6]; + +}; + +void brw_vs_emit( struct brw_vs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c new file mode 100644 index 0000000000..e03d653482 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs_emit.c @@ -0,0 +1,1330 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_context.h" +#include "brw_vs.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + +struct brw_prog_info { +   unsigned num_temps; +   unsigned num_addrs; +   unsigned num_consts; + +   unsigned writes_psize; + +   unsigned pos_idx; +   unsigned result_edge_idx; +   unsigned edge_flag_idx; +   unsigned psize_idx; +}; + +/* Do things as simply as possible.  Allocate and populate all regs + * ahead of time. 
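+ *
+ * GRFs are handed out in a fixed order: r0, the curbe (user clip
+ * planes followed by the constants), one register per input
+ * attribute, the outputs (position in a GRF, the rest as message
+ * registers), program temporaries, address registers, outputs that
+ * are also read as sources, the stack, and finally scratch
+ * temporaries.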
+ */ +static void brw_vs_alloc_regs( struct brw_vs_compile *c, +                               struct brw_prog_info *info ) +{ +   unsigned i, reg = 0, mrf; +   unsigned nr_params; + +   /* r0 -- reserved as usual +    */ +   c->r0 = brw_vec8_grf(reg, 0); reg++; + +   /* User clip planes from curbe: +    */ +   if (c->key.nr_userclip) { +      for (i = 0; i < c->key.nr_userclip; i++) { +	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1); +      } + +      /* Deal with curbe alignment: +       */ +      reg += ((6+c->key.nr_userclip+3)/4)*2; +   } + +   /* Vertex program parameters from curbe: +    */ +   nr_params = c->prog_data.max_const; +   for (i = 0; i < nr_params; i++) { +      c->regs[TGSI_FILE_CONSTANT][i] = stride(brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1); +   } +   reg += (nr_params+1)/2; +   c->prog_data.curb_read_length = reg - 1; + + + +   /* Allocate input regs: +    */ +   c->nr_inputs = c->vp->info.num_inputs; +   for (i = 0; i < c->nr_inputs; i++) { +	 c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0); +	 reg++; +   } + + +   /* Allocate outputs: TODO: could organize the non-position outputs +    * to go straight into message regs. +    */ +   c->nr_outputs = 0; +   c->first_output = reg; +   mrf = 4; +   for (i = 0; i < c->vp->info.num_outputs; i++) { +      c->nr_outputs++; +#if 0 +      if (i == VERT_RESULT_HPOS) { +         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); +         reg++; +      } +      else if (i == VERT_RESULT_PSIZ) { +         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); +         reg++; +         mrf++;		/* just a placeholder?  XXX fix later stages & remove this */ +      } +      else { +         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf); +         mrf++; +      } +#else +      /*treat pos differently for now */ +      if (i == info->pos_idx) { +         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); +         reg++; +      } else { +         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf); +         mrf++; +      } +#endif +   } + +   /* Allocate program temporaries: +    */ +   for (i = 0; i < info->num_temps; i++) { +      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0); +      reg++; +   } + +   /* Address reg(s).  Don't try to use the internal address reg until +    * deref time. +    */ +   for (i = 0; i < info->num_addrs; i++) { +      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE, +                                               reg, +                                               0, +                                               BRW_REGISTER_TYPE_D, +                                               BRW_VERTICAL_STRIDE_8, +                                               BRW_WIDTH_8, +                                               BRW_HORIZONTAL_STRIDE_1, +                                               BRW_SWIZZLE_XXXX, +                                               TGSI_WRITEMASK_X); +      reg++; +   } + +   for (i = 0; i < 128; i++) { +      if (c->output_regs[i].used_in_src) { +         c->output_regs[i].reg = brw_vec8_grf(reg, 0); +         reg++; +      } +   } + +   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); +   reg += 2; + + +   /* Some opcodes need an internal temporary: +    */ +   c->first_tmp = reg; +   c->last_tmp = reg;		/* for allocation purposes */ + +   /* Each input reg holds data from two vertices.  
The +    * urb_read_length is the number of registers read from *each* +    * vertex urb, so is half the amount: +    */ +   c->prog_data.urb_read_length = (c->nr_inputs+1)/2; + +   c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4; +   c->prog_data.total_grf = reg; +} + + +static struct brw_reg get_tmp( struct brw_vs_compile *c ) +{ +   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0); + +   if (++c->last_tmp > c->prog_data.total_grf) +      c->prog_data.total_grf = c->last_tmp; + +   return tmp; +} + +static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp ) +{ +   if (tmp.nr == c->last_tmp-1) +      c->last_tmp--; +} + +static void release_tmps( struct brw_vs_compile *c ) +{ +   c->last_tmp = c->first_tmp; +} + + +static void unalias1( struct brw_vs_compile *c, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      void (*func)( struct brw_vs_compile *, +				    struct brw_reg, +				    struct brw_reg )) +{ +   if (dst.file == arg0.file && dst.nr == arg0.nr) { +      struct brw_compile *p = &c->func; +      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); +      func(c, tmp, arg0); +      brw_MOV(p, dst, tmp); +   } +   else { +      func(c, dst, arg0); +   } +} + +static void unalias2( struct brw_vs_compile *c, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1, +		      void (*func)( struct brw_vs_compile *, +				    struct brw_reg, +				    struct brw_reg, +				    struct brw_reg )) +{ +   if ((dst.file == arg0.file && dst.nr == arg0.nr) || +       (dst.file == arg1.file && dst.nr == arg1.nr)) { +      struct brw_compile *p = &c->func; +      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); +      func(c, tmp, arg0, arg1); +      brw_MOV(p, dst, tmp); +   } +   else { +      func(c, dst, arg0, arg1); +   } +} + +static void emit_sop( struct brw_compile *p, +                      struct brw_reg dst, +                      struct brw_reg arg0, +                      struct brw_reg arg1, +		      unsigned cond) +{ +   brw_push_insn_state(p); +   brw_CMP(p, brw_null_reg(), cond, arg0, arg1); +   brw_set_predicate_control(p, BRW_PREDICATE_NONE); +   brw_MOV(p, dst, brw_imm_f(1.0f)); +   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); +   brw_MOV(p, dst, brw_imm_f(0.0f)); +   brw_pop_insn_state(p); +} + +static void emit_seq( struct brw_compile *p, +                      struct brw_reg dst, +                      struct brw_reg arg0, +                      struct brw_reg arg1 ) +{ +   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ); +} + +static void emit_sne( struct brw_compile *p, +                      struct brw_reg dst, +                      struct brw_reg arg0, +                      struct brw_reg arg1 ) +{ +   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ); +} +static void emit_slt( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1 ) +{ +   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L); +} + +static void emit_sle( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1 ) +{ +   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE); +} + +static void emit_sgt( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1 ) +{ +   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G); +} + +static void emit_sge( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct 
brw_reg arg1 ) +{ +  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE); +} + +static void emit_max( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1 ) +{ +   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); +   brw_SEL(p, dst, arg1, arg0); +   brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void emit_min( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg arg0, +		      struct brw_reg arg1 ) +{ +   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); +   brw_SEL(p, dst, arg0, arg1); +   brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + + +static void emit_math1( struct brw_vs_compile *c, +			unsigned function, +			struct brw_reg dst, +			struct brw_reg arg0, +			unsigned precision) +{ +   /* There are various odd behaviours with SEND on the simulator.  In +    * addition there are documented issues with the fact that the GEN4 +    * processor doesn't do dependency control properly on SEND +    * results.  So, on balance, this kludge to get around failures +    * with writemasked math results looks like it might be necessary +    * whether that turns out to be a simulator bug or not: +    */ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = dst; +   boolean need_tmp = (dst.dw1.bits.writemask != 0xf || +			 dst.file != BRW_GENERAL_REGISTER_FILE); + +   if (need_tmp) +      tmp = get_tmp(c); + +   brw_math(p, +	    tmp, +	    function, +	    BRW_MATH_SATURATE_NONE, +	    2, +	    arg0, +	    BRW_MATH_DATA_SCALAR, +	    precision); + +   if (need_tmp) { +      brw_MOV(p, dst, tmp); +      release_tmp(c, tmp); +   } +} + +static void emit_math2( struct brw_vs_compile *c, +			unsigned function, +			struct brw_reg dst, +			struct brw_reg arg0, +			struct brw_reg arg1, +			unsigned precision) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = dst; +   boolean need_tmp = (dst.dw1.bits.writemask != 0xf || +			 dst.file != BRW_GENERAL_REGISTER_FILE); + +   if (need_tmp) +      tmp = get_tmp(c); + +   brw_MOV(p, brw_message_reg(3), arg1); + +   brw_math(p, +	    tmp, +	    function, +	    BRW_MATH_SATURATE_NONE, +	    2, + 	    arg0, +	    BRW_MATH_DATA_SCALAR, +	    precision); + +   if (need_tmp) { +      brw_MOV(p, dst, tmp); +      release_tmp(c, tmp); +   } +} + + + +static void emit_exp_noalias( struct brw_vs_compile *c, +			      struct brw_reg dst, +			      struct brw_reg arg0 ) +{ +   struct brw_compile *p = &c->func; + + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) { +      struct brw_reg tmp = get_tmp(c); +      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D); + +      /* tmp_d = floor(arg0.x) */ +      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0)); + +      /* result[0] = 2.0 ^ tmp */ + +      /* Adjust exponent for floating point: +       * exp += 127 +       */ +      brw_ADD(p, brw_writemask(tmp_d, TGSI_WRITEMASK_X), tmp_d, brw_imm_d(127)); + +      /* Install exponent and sign. 
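+       * (For example, with arg0.x = 2.5 the RNDD above gives tmp_d = 2;
+       * adding the bias yields 129, and 129 << 23 is 0x40800000, the
+       * IEEE-754 single-precision bit pattern of 4.0f = 2^2.)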
+       * Excess drops off the edge: +       */ +      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), TGSI_WRITEMASK_X), +	      tmp_d, brw_imm_d(23)); + +      release_tmp(c, tmp); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) { +      /* result[1] = arg0.x - floor(arg0.x) */ +      brw_FRC(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0, 0)); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) { +      /* As with the LOG instruction, we might be better off just +       * doing a taylor expansion here, seeing as we have to do all +       * the prep work. +       * +       * If mathbox partial precision is too low, consider also: +       * result[3] = result[0] * EXP(result[1]) +       */ +      emit_math1(c, +		 BRW_MATH_FUNCTION_EXP, +		 brw_writemask(dst, TGSI_WRITEMASK_Z), +		 brw_swizzle1(arg0, 0), +		 BRW_MATH_PRECISION_PARTIAL); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) { +      /* result[3] = 1.0; */ +      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), brw_imm_f(1)); +   } +} + + +static void emit_log_noalias( struct brw_vs_compile *c, +			      struct brw_reg dst, +			      struct brw_reg arg0 ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = dst; +   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); +   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD); +   boolean need_tmp = (dst.dw1.bits.writemask != 0xf || +			 dst.file != BRW_GENERAL_REGISTER_FILE); + +   if (need_tmp) { +      tmp = get_tmp(c); +      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); +   } + +   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt +    * according to spec: +    * +    * These almost look likey they could be joined up, but not really +    * practical: +    * +    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127 +    * result[1].i = (x.i & ((1<<23)-1)        + (127<<23) +    */ +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_XZ) { +      brw_AND(p, +	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X), +	      brw_swizzle1(arg0_ud, 0), +	      brw_imm_ud((1U<<31)-1)); + +      brw_SHR(p, +	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X), +	      tmp_ud, +	      brw_imm_ud(23)); + +      brw_ADD(p, +	      brw_writemask(tmp, TGSI_WRITEMASK_X), +	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */ +	      brw_imm_d(-127)); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_YZ) { +      brw_AND(p, +	      brw_writemask(tmp_ud, TGSI_WRITEMASK_Y), +	      brw_swizzle1(arg0_ud, 0), +	      brw_imm_ud((1<<23)-1)); + +      brw_OR(p, +	     brw_writemask(tmp_ud, TGSI_WRITEMASK_Y), +	     tmp_ud, +	     brw_imm_ud(127<<23)); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) { +      /* result[2] = result[0] + LOG2(result[1]); */ + +      /* Why bother?  The above is just a hint how to do this with a +       * taylor series.  Maybe we *should* use a taylor series as by +       * the time all the above has been done it's almost certainly +       * quicker than calling the mathbox, even with low precision. 
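+       * (Worked example of the split above: x = 12.0f is 0x41400000, so
+       * the exponent field is 0x82 and result[0] = 0x82 - 127 = 3, while
+       * re-biasing the mantissa gives result[1] = 1.5f; log2(12) is then
+       * 3 + log2(1.5), about 3.585.)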
+       * +       * Options are: +       *    - result[0] + mathbox.LOG2(result[1]) +       *    - mathbox.LOG2(arg0.x) +       *    - result[0] + inline_taylor_approx(result[1]) +       */ +      emit_math1(c, +		 BRW_MATH_FUNCTION_LOG, +		 brw_writemask(tmp, TGSI_WRITEMASK_Z), +		 brw_swizzle1(tmp, 1), +		 BRW_MATH_PRECISION_FULL); + +      brw_ADD(p, +	      brw_writemask(tmp, TGSI_WRITEMASK_Z), +	      brw_swizzle1(tmp, 2), +	      brw_swizzle1(tmp, 0)); +   } + +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) { +      /* result[3] = 1.0; */ +      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_W), brw_imm_f(1)); +   } + +   if (need_tmp) { +      brw_MOV(p, dst, tmp); +      release_tmp(c, tmp); +   } +} + + + + +/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1 + */ +static void emit_dst_noalias( struct brw_vs_compile *c, +			      struct brw_reg dst, +			      struct brw_reg arg0, +			      struct brw_reg arg1) +{ +   struct brw_compile *p = &c->func; + +   /* There must be a better way to do this: +    */ +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) +      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_X), brw_imm_f(1.0)); +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) +      brw_MUL(p, brw_writemask(dst, TGSI_WRITEMASK_Y), arg0, arg1); +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) +      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Z), arg0); +   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) +      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), arg1); +} + +static void emit_xpd( struct brw_compile *p, +		      struct brw_reg dst, +		      struct brw_reg t, +		      struct brw_reg u) +{ +   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3)); +   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3)); +} + + + +static void emit_lit_noalias( struct brw_vs_compile *c, +			      struct brw_reg dst, +			      struct brw_reg arg0 ) +{ +   struct brw_compile *p = &c->func; +   struct brw_instruction *if_insn; +   struct brw_reg tmp = dst; +   boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + +   if (need_tmp) +      tmp = get_tmp(c); + +   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_YZ), brw_imm_f(0)); +   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_XW), brw_imm_f(1)); + +   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order +    * to get all channels active inside the IF.  In the clipping code +    * we run with NoMask, so it's not an option and we can use +    * BRW_EXECUTE_1 for all comparisions. +    */ +   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0)); +   if_insn = brw_IF(p, BRW_EXECUTE_8); +   { +      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0,0)); + +      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0)); +      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_Z),  brw_swizzle1(arg0,1)); +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); + +      emit_math2(c, +		 BRW_MATH_FUNCTION_POW, +		 brw_writemask(dst, TGSI_WRITEMASK_Z), +		 brw_swizzle1(tmp, 2), +		 brw_swizzle1(arg0, 3), +		 BRW_MATH_PRECISION_PARTIAL); +   } + +   brw_ENDIF(p, if_insn); +} + + + + + +/* TODO: relative addressing! 
+ */ +static struct brw_reg get_reg( struct brw_vs_compile *c, +			       unsigned file, +			       unsigned index ) +{ +   switch (file) { +   case TGSI_FILE_TEMPORARY: +   case TGSI_FILE_INPUT: +   case TGSI_FILE_OUTPUT: +      assert(c->regs[file][index].nr != 0); +      return c->regs[file][index]; +   case TGSI_FILE_CONSTANT: +      assert(c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm].nr != 0); +      return c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm]; +   case TGSI_FILE_IMMEDIATE: +      assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0); +      return c->regs[TGSI_FILE_CONSTANT][index]; +   case TGSI_FILE_ADDRESS: +      assert(index == 0); +      return c->regs[file][index]; + +   case TGSI_FILE_NULL:			/* undef values */ +      return brw_null_reg(); + +   default: +      assert(0); +      return brw_null_reg(); +   } +} + + + +static struct brw_reg deref( struct brw_vs_compile *c, +			     struct brw_reg arg, +			     int offset) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = vec4(get_tmp(c)); +   struct brw_reg vp_address = retype(vec1(get_reg(c, TGSI_FILE_ADDRESS, 0)), BRW_REGISTER_TYPE_UW); +   unsigned byte_offset = arg.nr * 32 + arg.subnr + offset * 16; +   struct brw_reg indirect = brw_vec4_indirect(0,0); + +   { +      brw_push_insn_state(p); +      brw_set_access_mode(p, BRW_ALIGN_1); + +      /* This is pretty clunky - load the address register twice and +       * fetch each 4-dword value in turn.  There must be a way to do +       * this in a single pass, but I couldn't get it to work. +       */ +      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); +      brw_MOV(p, tmp, indirect); + +      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); +      brw_MOV(p, suboffset(tmp, 4), indirect); + +      brw_pop_insn_state(p); +   } + +   return vec8(tmp); +} + + +static void emit_arl( struct brw_vs_compile *c, +		      struct brw_reg dst, +		      struct brw_reg arg0 ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg tmp = dst; +   boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + +   if (need_tmp) +      tmp = get_tmp(c); + +   brw_RNDD(p, tmp, arg0); +   brw_MUL(p, dst, tmp, brw_imm_d(16)); + +   if (need_tmp) +      release_tmp(c, tmp); +} + + +/* Will return mangled results for SWZ op.  The emit_swz() function + * ignores this result and recalculates taking extended swizzles into + * account. + */ +static struct brw_reg get_arg( struct brw_vs_compile *c, +			       struct tgsi_src_register *src ) +{ +   struct brw_reg reg; + +   if (src->File == TGSI_FILE_NULL) +      return brw_null_reg(); + +#if 0 +   if (src->RelAddr) +      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index); +   else +#endif +      reg = get_reg(c, src->File, src->Index); + +   /* Convert 3-bit swizzle to 2-bit. +    */ +   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SwizzleX, +				       src->SwizzleY, +				       src->SwizzleZ, +				       src->SwizzleW); + +   /* Note this is ok for non-swizzle instructions: +    */ +   reg.negate = src->Negate ? 
1 : 0;
+
+   return reg;
+}
+
+
+static struct brw_reg get_dst( struct brw_vs_compile *c,
+                               const struct tgsi_dst_register *dst )
+{
+   struct brw_reg reg = get_reg(c, dst->File, dst->Index);
+
+   reg.dw1.bits.writemask = dst->WriteMask;
+
+   return reg;
+}
+
+
+static void emit_swz( struct brw_vs_compile *c,
+                      struct brw_reg dst,
+                      struct tgsi_src_register src )
+{
+   struct brw_compile *p = &c->func;
+   unsigned zeros_mask = 0;
+   unsigned ones_mask = 0;
+   unsigned src_mask = 0;
+   ubyte src_swz[4];
+   boolean need_tmp = (src.Negate &&
+                       dst.file != BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg tmp = dst;
+   unsigned i;
+
+   if (need_tmp)
+      tmp = get_tmp(c);
+
+   for (i = 0; i < 4; i++) {
+      if (dst.dw1.bits.writemask & (1<<i)) {
+         ubyte s = 0;
+         switch (i) {
+         case 0:
+            s = src.SwizzleX;
+            break;
+         case 1:
+            s = src.SwizzleY;
+            break;
+         case 2:
+            s = src.SwizzleZ;
+            break;
+         case 3:
+            s = src.SwizzleW;
+            break;
+         }
+         switch (s) {
+         case TGSI_SWIZZLE_X:
+         case TGSI_SWIZZLE_Y:
+         case TGSI_SWIZZLE_Z:
+         case TGSI_SWIZZLE_W:
+            src_mask |= 1<<i;
+            src_swz[i] = s;
+            break;
+         case TGSI_EXTSWIZZLE_ZERO:
+            zeros_mask |= 1<<i;
+            break;
+         case TGSI_EXTSWIZZLE_ONE:
+            ones_mask |= 1<<i;
+            break;
+         }
+      }
+   }
+
+   /* Do src first, in case dst aliases src:
+    */
+   if (src_mask) {
+      struct brw_reg arg0;
+
+#if 0
+      if (src.RelAddr)
+         arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
+      else
+#endif
+         arg0 = get_reg(c, src.File, src.Index);
+
+      arg0 = brw_swizzle(arg0,
+                         src_swz[0], src_swz[1],
+                         src_swz[2], src_swz[3]);
+
+      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
+   }
+
+   if (zeros_mask)
+      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
+
+   if (ones_mask)
+      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
+
+   if (src.Negate)
+      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
+
+   if (need_tmp) {
+      brw_MOV(p, dst, tmp);
+      release_tmp(c, tmp);
+   }
+}
+
+
+/* Post-vertex-program processing.  Send the results to the URB.
+ */
+static void emit_vertex_write( struct brw_vs_compile *c, struct brw_prog_info *info)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg m0 = brw_message_reg(0);
+   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][info->pos_idx];
+   struct brw_reg ndc;
+
+   if (c->key.copy_edgeflag) {
+      brw_MOV(p,
+              get_reg(c, TGSI_FILE_OUTPUT, info->result_edge_idx),
+              get_reg(c, TGSI_FILE_INPUT, info->edge_flag_idx));
+   }
+
+
+   /* Build ndc coords?   TODO: Short-circuit when w is known to be one.
+    */ +   if (!c->key.know_w_is_one) { +      ndc = get_tmp(c); +      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); +      brw_MUL(p, brw_writemask(ndc, TGSI_WRITEMASK_XYZ), pos, ndc); +   } +   else { +      ndc = pos; +   } + +   /* This includes the workaround for -ve rhw, so is no longer an +    * optional step: +    */ +   if (info->writes_psize || +       c->key.nr_userclip || +       !c->key.know_w_is_one) +   { +      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); +      unsigned i; + +      brw_MOV(p, header1, brw_imm_ud(0)); + +      brw_set_access_mode(p, BRW_ALIGN_16); + +      if (info->writes_psize) { +	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][info->psize_idx]; +	 brw_MUL(p, brw_writemask(header1, TGSI_WRITEMASK_W), +                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11)); +	 brw_AND(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, +                 brw_imm_ud(0x7ff<<8)); +      } + + +      for (i = 0; i < c->key.nr_userclip; i++) { +	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L); +	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]); +	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<i)); +	 brw_set_predicate_control(p, BRW_PREDICATE_NONE); +      } + + +      /* i965 clipping workaround: +       * 1) Test for -ve rhw +       * 2) If set, +       *      set ndc = (0,0,0,0) +       *      set ucp[6] = 1 +       * +       * Later, clipping will detect ucp[6] and ensure the primitive is +       * clipped against all fixed planes. +       */ +      if (!c->key.know_w_is_one) { +	 brw_CMP(p, +		 vec8(brw_null_reg()), +		 BRW_CONDITIONAL_L, +		 brw_swizzle1(ndc, 3), +		 brw_imm_f(0)); + +	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<6)); +	 brw_MOV(p, ndc, brw_imm_f(0)); +	 brw_set_predicate_control(p, BRW_PREDICATE_NONE); +      } + +      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? 
*/ +      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1); +      brw_set_access_mode(p, BRW_ALIGN_16); + +      release_tmp(c, header1); +   } +   else { +      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); +   } + + +   /* Emit the (interleaved) headers for the two vertices - an 8-reg +    * of zeros followed by two sets of NDC coordinates: +    */ +   brw_set_access_mode(p, BRW_ALIGN_1); +   brw_MOV(p, offset(m0, 2), ndc); +   brw_MOV(p, offset(m0, 3), pos); + + +   brw_urb_WRITE(p, +		 brw_null_reg(), /* dest */ +		 0,		/* starting mrf reg nr */ +		 c->r0,		/* src */ +		 0,		/* allocate */ +		 1,		/* used */ +		 c->nr_outputs + 3, /* msg len */ +		 0,		/* response len */ +		 1, 		/* eot */ +		 1, 		/* writes complete */ +		 0, 		/* urb destination offset */ +		 BRW_URB_SWIZZLE_INTERLEAVE); + +} + +static void +post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst ) +{ +   struct tgsi_parse_context parse; +   const struct tgsi_token *tokens = c->vp->program.tokens; +   tgsi_parse_init(&parse, tokens); +   while (!tgsi_parse_end_of_tokens(&parse)) { +      tgsi_parse_token(&parse); +      if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { +#if 0 +         struct brw_instruction *brw_inst1, *brw_inst2; +         const struct tgsi_full_instruction *inst1, *inst2; +         int offset; +         inst1 = &parse.FullToken.FullInstruction; +         brw_inst1 = inst1->Data; +         switch (inst1->Opcode) { +	 case TGSI_OPCODE_CAL: +	 case TGSI_OPCODE_BRA: +	    target_insn = inst1->BranchTarget; +	    inst2 = &c->vp->program.Base.Instructions[target_insn]; +	    brw_inst2 = inst2->Data; +	    offset = brw_inst2 - brw_inst1; +	    brw_set_src1(brw_inst1, brw_imm_d(offset*16)); +	    break; +	 case TGSI_OPCODE_END: +	    offset = end_inst - brw_inst1; +	    brw_set_src1(brw_inst1, brw_imm_d(offset*16)); +	    break; +	 default: +	    break; +         } +#endif +      } +   } +   tgsi_parse_free(&parse); +} + +static void process_declaration(const struct tgsi_full_declaration *decl, +                                struct brw_prog_info *info) +{ +   int first = decl->DeclarationRange.First; +   int last = decl->DeclarationRange.Last; +    +   switch(decl->Declaration.File) { +   case TGSI_FILE_CONSTANT:  +      info->num_consts += last - first + 1; +      break; +   case TGSI_FILE_INPUT: { +   } +      break; +   case TGSI_FILE_OUTPUT: { +      assert(last == first);	/* for now */ +      if (decl->Declaration.Semantic) { +         switch (decl->Semantic.SemanticName) { +         case TGSI_SEMANTIC_POSITION: { +            info->pos_idx = first; +         } +            break; +         case TGSI_SEMANTIC_COLOR: +            break; +         case TGSI_SEMANTIC_BCOLOR: +            break; +         case TGSI_SEMANTIC_FOG: +            break; +         case TGSI_SEMANTIC_PSIZE: { +            info->writes_psize = TRUE; +            info->psize_idx = first; +         } +            break; +         case TGSI_SEMANTIC_GENERIC: +            break; +         } +      } +   } +      break; +   case TGSI_FILE_TEMPORARY: { +      info->num_temps += (last - first) + 1; +   } +      break; +   case TGSI_FILE_SAMPLER: { +   } +      break; +   case TGSI_FILE_ADDRESS: { +      info->num_addrs += (last - first) + 1; +   } +      break; +   case TGSI_FILE_IMMEDIATE: { +   } +      break; +   case TGSI_FILE_NULL: { +   } +      break; +   } +} + +static void process_instruction(struct brw_vs_compile *c, +                       
         struct tgsi_full_instruction *inst, +                                struct brw_prog_info *info) +{ +   struct brw_reg args[3], dst; +   struct brw_compile *p = &c->func; +   /*struct brw_indirect stack_index = brw_indirect(0, 0);*/ +   unsigned i; +   unsigned index; +   unsigned file; +   /*FIXME: might not be the only one*/ +   const struct tgsi_dst_register *dst_reg = &inst->FullDstRegisters[0].DstRegister; +   /* +   struct brw_instruction *if_inst[MAX_IFSN]; +   unsigned insn, if_insn = 0; +   */ + +   for (i = 0; i < 3; i++) { +      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; +      index = src->SrcRegister.Index; +      file = src->SrcRegister.File; +      if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src) +         args[i] = c->output_regs[index].reg; +      else +         args[i] = get_arg(c, &src->SrcRegister); +   } + +   /* Get dest regs.  Note that it is possible for a reg to be both +    * dst and arg, given the static allocation of registers.  So +    * care needs to be taken emitting multi-operation instructions. +    */ +   index = dst_reg->Index; +   file = dst_reg->File; +   if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src) +      dst = c->output_regs[index].reg; +   else +      dst = get_dst(c, dst_reg); + +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_ABS: +      brw_MOV(p, dst, brw_abs(args[0])); +      break; +   case TGSI_OPCODE_ADD: +      brw_ADD(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_DP3: +      brw_DP3(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_DP4: +      brw_DP4(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_DPH: +      brw_DPH(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_DST: +      unalias2(c, dst, args[0], args[1], emit_dst_noalias); +      break; +   case TGSI_OPCODE_EXP: +      unalias1(c, dst, args[0], emit_exp_noalias); +      break; +   case TGSI_OPCODE_EX2: +      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); +      break; +   case TGSI_OPCODE_ARL: +      emit_arl(c, dst, args[0]); +      break; +   case TGSI_OPCODE_FLR: +      brw_RNDD(p, dst, args[0]); +      break; +   case TGSI_OPCODE_FRC: +      brw_FRC(p, dst, args[0]); +      break; +   case TGSI_OPCODE_LOG: +      unalias1(c, dst, args[0], emit_log_noalias); +      break; +   case TGSI_OPCODE_LG2: +      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL); +      break; +   case TGSI_OPCODE_LIT: +      unalias1(c, dst, args[0], emit_lit_noalias); +      break; +   case TGSI_OPCODE_MAD: +      brw_MOV(p, brw_acc_reg(), args[2]); +      brw_MAC(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_MAX: +      emit_max(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_MIN: +      emit_min(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_SWZ: +#if 0 +      /* The args[0] value can't be used here as it won't have +       * correctly encoded the full swizzle: +       */ +      emit_swz(c, dst, inst->SrcReg[0] ); +#endif +      brw_MOV(p, dst, args[0]); +      break; +   case TGSI_OPCODE_MUL: +      brw_MUL(p, dst, args[0], args[1]); +      break; +   case TGSI_OPCODE_POW: +      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); +      break; +   case TGSI_OPCODE_RCP: +      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL); +      break; +   case TGSI_OPCODE_RSQ: +      
emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      emit_seq(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sne(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sge(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sgt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_slt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sle(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SUB:
+      brw_ADD(p, dst, args[0], negate(args[1]));
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(p, dst, args[0], args[1]);
+      break;
+#if 0
+   case TGSI_OPCODE_IF:
+      assert(if_insn < MAX_IFSN);
+      if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_ELSE:
+      if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(if_insn > 0);
+      brw_ENDIF(p, if_inst[--if_insn]);
+      break;
+   case TGSI_OPCODE_BRA:
+      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_set_predicate_control_flag_value(p, 0xff);
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, get_addr_reg(stack_index),
+              get_addr_reg(stack_index), brw_imm_d(4));
+      inst->Data = &p->store[p->nr_insn];
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+#endif
+   case TGSI_OPCODE_RET:
+#if 0
+      brw_ADD(p, get_addr_reg(stack_index),
+              get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+#else
+      /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
+#endif
+      break;
+   case TGSI_OPCODE_END:
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+      break;
+   default:
+      debug_printf("Unsupported opcode %d in vertex shader\n", inst->Instruction.Opcode);
+      break;
+   }
+
+   if (dst_reg->File == TGSI_FILE_OUTPUT
+       && dst_reg->Index != info->pos_idx
+       && c->output_regs[dst_reg->Index].used_in_src)
+      brw_MOV(p, get_dst(c, dst_reg), dst);
+
+   release_tmps(c);
+}
+
+/* Emit the vertex program instructions here.
+ */ +void brw_vs_emit(struct brw_vs_compile *c) +{ +#define MAX_IFSN 32 +   struct brw_compile *p = &c->func; +   struct brw_instruction *end_inst; +   struct tgsi_parse_context parse; +   struct brw_indirect stack_index = brw_indirect(0, 0); +   const struct tgsi_token *tokens = c->vp->program.tokens; +   struct brw_prog_info prog_info; +   unsigned allocated_registers = 0; +   memset(&prog_info, 0, sizeof(struct brw_prog_info)); + +   brw_set_compression_control(p, BRW_COMPRESSION_NONE); +   brw_set_access_mode(p, BRW_ALIGN_16); + +   tgsi_parse_init(&parse, tokens); +   /* Message registers can't be read, so copy the output into GRF register +      if they are used in source registers */ +   while (!tgsi_parse_end_of_tokens(&parse)) { +      tgsi_parse_token(&parse); +      unsigned i; +      switch (parse.FullToken.Token.Type) { +      case TGSI_TOKEN_TYPE_INSTRUCTION: { +         const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; +         for (i = 0; i < 3; ++i) { +            const struct tgsi_src_register *src = &inst->FullSrcRegisters[i].SrcRegister; +            unsigned index = src->Index; +            unsigned file = src->File; +            if (file == TGSI_FILE_OUTPUT) +               c->output_regs[index].used_in_src = TRUE; +         } +      } +         break; +      default: +         /* nothing */ +         break; +      } +   } +   tgsi_parse_free(&parse); + +   tgsi_parse_init(&parse, tokens); + +   while (!tgsi_parse_end_of_tokens(&parse)) { +      tgsi_parse_token(&parse); + +      switch (parse.FullToken.Token.Type) { +      case TGSI_TOKEN_TYPE_DECLARATION: { +         struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; +         process_declaration(decl, &prog_info); +      } +         break; +      case TGSI_TOKEN_TYPE_IMMEDIATE: { +         struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate; +         assert(imm->Immediate.NrTokens == 4 + 1); +         c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u.ImmediateFloat32[0].Float; +         c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u.ImmediateFloat32[1].Float; +         c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u.ImmediateFloat32[2].Float; +         c->prog_data.imm_buf[c->prog_data.num_imm][3] = imm->u.ImmediateFloat32[3].Float; +         c->prog_data.num_imm++; +      } +         break; +      case TGSI_TOKEN_TYPE_INSTRUCTION: { +         struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; +         if (!allocated_registers) { +            /* first instruction (declerations finished). 
+             * now that we know what vars are being used allocate +             * registers for them.*/ +            c->prog_data.num_consts = prog_info.num_consts; +            c->prog_data.max_const = prog_info.num_consts + c->prog_data.num_imm; +            brw_vs_alloc_regs(c, &prog_info); + +	    brw_set_access_mode(p, BRW_ALIGN_1); +            brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); +	    brw_set_access_mode(p, BRW_ALIGN_16); +            allocated_registers = 1; +         } +         process_instruction(c, inst, &prog_info); +      } +         break; +      } +   } + +   end_inst = &p->store[p->nr_insn]; +   emit_vertex_write(c, &prog_info); +   post_vs_emit(c, end_inst); +   tgsi_parse_free(&parse); + +} diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c new file mode 100644 index 0000000000..1eaff87892 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs_state.c @@ -0,0 +1,103 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +static void upload_vs_unit( struct brw_context *brw ) +{ +   struct brw_vs_unit_state vs; + +   memset(&vs, 0, sizeof(vs)); + +   /* CACHE_NEW_VS_PROG */ +   vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6; +   vs.thread0.grf_reg_count = align(brw->vs.prog_data->total_grf, 16) / 16 - 1; +   vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length; +   vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length; +   vs.thread3.dispatch_grf_start_reg = 1; + + +   /* BRW_NEW_URB_FENCE  */ +   vs.thread4.nr_urb_entries = brw->urb.nr_vs_entries; +   vs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; +   vs.thread4.max_threads = MIN2( +      MAX2(0, (brw->urb.nr_vs_entries - 6) / 2 - 1), +      15); + + + +   if (BRW_DEBUG & DEBUG_SINGLE_THREAD) +      vs.thread4.max_threads = 0; + +   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */ +   if (0 /*brw->attribs.Clip->ClipPlanesEnabled*/) { +      /* Note that we read in the userclip planes as well, hence +       * clip_start: +       */ +      vs.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2; +   } +   else { +      vs.thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2; +   } + +   vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; +   vs.thread3.urb_entry_read_offset = 0; + +   /* No samplers for ARB_vp programs: +    */ +   vs.vs5.sampler_count = 0; + +   if (BRW_DEBUG & DEBUG_STATS) +      vs.thread4.stats_enable = 1; + +   /* Vertex program always enabled: +    */ +   vs.vs6.vs_enable = 1; + +   brw->vs.state_gs_offset = brw_cache_data( &brw->cache[BRW_VS_UNIT], &vs ); +} + + +const struct brw_tracked_state brw_vs_unit = { +   .dirty = { +      .brw   = (BRW_NEW_CLIP | +		BRW_NEW_CURBE_OFFSETS | +		BRW_NEW_URB_FENCE), +      .cache = CACHE_NEW_VS_PROG +   }, +   .update = upload_vs_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_winsys.h b/src/gallium/drivers/i965simple/brw_winsys.h new file mode 100644 index 0000000000..ec1e400418 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_winsys.h @@ -0,0 +1,209 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * This is the interface that i965simple requires any window system + * hosting it to implement.  This is the only include file in i965simple + * which is public. + * + */ + +#ifndef BRW_WINSYS_H +#define BRW_WINSYS_H + + +#include "pipe/p_defines.h" + + +/* Pipe drivers are (meant to be!) independent of both GL and the + * window system.  The window system provides a buffer manager and a + * set of additional hooks for things like command buffer submission, + * etc. + * + * There clearly has to be some agreement between the window system + * driver and the hardware driver about the format of command buffers, + * etc. + */ + +struct pipe_buffer; +struct pipe_fence_handle; +struct pipe_winsys; +struct pipe_screen; + + +/* The pipe driver currently understands the following chipsets: + */ +#define PCI_CHIP_I965_G			0x29A2 +#define PCI_CHIP_I965_Q			0x2992 +#define PCI_CHIP_I965_G_1		0x2982 +#define PCI_CHIP_I965_GM                0x2A02 +#define PCI_CHIP_I965_GME               0x2A12 + + +/* These are the names of all the state caches managed by the driver. + *  + * When data is uploaded to a buffer with buffer_subdata, we use the + * special version of that function below so that information about + * what type of data this is can be passed to the winsys backend. + * That in turn allows the correct flags to be set in the aub file + * dump to allow human-readable file dumps later on. + */ + +enum brw_cache_id { +   BRW_CC_VP, +   BRW_CC_UNIT, +   BRW_WM_PROG, +   BRW_SAMPLER_DEFAULT_COLOR, +   BRW_SAMPLER, +   BRW_WM_UNIT, +   BRW_SF_PROG, +   BRW_SF_VP, +   BRW_SF_UNIT, +   BRW_VS_UNIT, +   BRW_VS_PROG, +   BRW_GS_UNIT, +   BRW_GS_PROG, +   BRW_CLIP_VP, +   BRW_CLIP_UNIT, +   BRW_CLIP_PROG, +   BRW_SS_SURFACE, +   BRW_SS_SURF_BIND, + +   BRW_MAX_CACHE +}; + +#define BRW_CONSTANT_BUFFER BRW_MAX_CACHE + +/** + * Additional winsys interface for i965simple. + * + * It is an over-simple batchbuffer mechanism.  Will want to improve the + * performance of this, perhaps based on the cmdstream stuff.  It + * would be pretty impossible to implement swz on top of this + * interface. + * + * Will also need additions/changes to implement static/dynamic + * indirect state. + */ +struct brw_winsys { + +   void (*destroy)(struct brw_winsys *); +    +   /** +    * Reserve space on batch buffer. +    * +    * Returns a null pointer if there is insufficient space in the batch buffer +    * to hold the requested number of dwords and relocations. +    * +    * The number of dwords should also include the number of relocations. +    */ +   unsigned *(*batch_start)(struct brw_winsys *sws, +                            unsigned dwords, +                            unsigned relocs); + +   void (*batch_dword)(struct brw_winsys *sws, +                       unsigned dword); + +   /** +    * Emit a relocation to a buffer. +    * +    * Used not only when the buffer addresses are not pinned, but also to +    * ensure refered buffers will not be destroyed until the current batch +    * buffer execution is finished. +    * +    * The access flags is a combination of I915_BUFFER_ACCESS_WRITE and +    * I915_BUFFER_ACCESS_READ macros. 
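+    * (In this header the corresponding flags are the BRW_BUFFER_ACCESS_WRITE
+    * and BRW_BUFFER_ACCESS_READ defines near the end of the file.)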
+    */ +   void (*batch_reloc)(struct brw_winsys *sws, +                       struct pipe_buffer *buf, +                       unsigned access_flags, +                       unsigned delta); + + +   /* Not used yet, but really want this: +    */ +   void (*batch_end)( struct brw_winsys *sws ); + +   /** +    * Flush the batch buffer. +    * +    * Fence argument must point to NULL or to a previous fence, and the caller +    * must call fence_reference when done with the fence. +    */ +   void (*batch_flush)(struct brw_winsys *sws, +                       struct pipe_fence_handle **fence); + + +   /* A version of buffer_subdata that includes information for the +    * simulator: +    */ +   void (*buffer_subdata_typed)(struct brw_winsys *sws,  +				struct pipe_buffer *buf, +				unsigned long offset,  +				unsigned long size,  +				const void *data, +				unsigned data_type); +    + +   /* A cheat so we don't have to think about relocations in a couple +    * of places yet: +    */ +   unsigned (*get_buffer_offset)( struct brw_winsys *sws, +				  struct pipe_buffer *buf, +				  unsigned flags ); + +}; + +#define BRW_BUFFER_ACCESS_WRITE   0x1 +#define BRW_BUFFER_ACCESS_READ    0x2 + +#define BRW_BUFFER_USAGE_LIT_VERTEX  (PIPE_BUFFER_USAGE_CUSTOM << 0) + + +struct pipe_context *brw_create(struct pipe_screen *, +                                struct brw_winsys *, +                                unsigned pci_id); + +static inline boolean brw_batchbuffer_data(struct brw_winsys *winsys, +                                           const void *data, +                                           unsigned bytes) +{ +   static const unsigned incr = sizeof(unsigned); +   uint i; +   const unsigned *udata = (const unsigned*)(data); +   unsigned size = bytes/incr; + +   winsys->batch_start(winsys, size, 0); +   for (i = 0; i < size; ++i) { +      winsys->batch_dword(winsys, udata[i]); +   } +   winsys->batch_end(winsys); + +   return (i == size); +} +#endif diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c new file mode 100644 index 0000000000..10161f2d2f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm.c @@ -0,0 +1,209 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_util.h" +#include "brw_wm.h" +#include "brw_eu.h" +#include "brw_state.h" +#include "util/u_memory.h" + + + +static void do_wm_prog( struct brw_context *brw, +			struct brw_fragment_program *fp, +			struct brw_wm_prog_key *key) +{ +   struct brw_wm_compile *c = CALLOC_STRUCT(brw_wm_compile); +   const unsigned *program; +   unsigned program_size; + +   c->key = *key; +   c->fp = fp; +    +   c->delta_xy[0] = brw_null_reg(); +   c->delta_xy[1] = brw_null_reg(); +   c->pixel_xy[0] = brw_null_reg(); +   c->pixel_xy[1] = brw_null_reg(); +   c->pixel_w = brw_null_reg(); + + +   debug_printf("XXXXXXXX FP\n"); +    +   brw_wm_glsl_emit(c); + +   /* get the program +    */ +   program = brw_get_program(&c->func, &program_size); + +   /* +    */ +   brw->wm.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_WM_PROG], +					      &c->key, +					      sizeof(c->key), +					      program, +					      program_size, +					      &c->prog_data, +					      &brw->wm.prog_data ); + +   FREE(c); +} + + + +static void brw_wm_populate_key( struct brw_context *brw, +				 struct brw_wm_prog_key *key ) +{ +   /* BRW_NEW_FRAGMENT_PROGRAM */ +   struct brw_fragment_program *fp = +      (struct brw_fragment_program *)brw->attribs.FragmentProgram; +   unsigned lookup = 0; +   unsigned line_aa; +    +   memset(key, 0, sizeof(*key)); + +   /* Build the index for table lookup +    */ +   /* BRW_NEW_DEPTH_STENCIL */ +   if (fp->info.uses_kill || +       brw->attribs.DepthStencil->alpha.enabled) +      lookup |= IZ_PS_KILL_ALPHATEST_BIT; + +   if (fp->info.writes_z) +      lookup |= IZ_PS_COMPUTES_DEPTH_BIT; + +   if (brw->attribs.DepthStencil->depth.enabled) +      lookup |= IZ_DEPTH_TEST_ENABLE_BIT; + +   if (brw->attribs.DepthStencil->depth.enabled && +       brw->attribs.DepthStencil->depth.writemask) /* ?? */ +      lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; + +   if (brw->attribs.DepthStencil->stencil[0].enabled) { +      lookup |= IZ_STENCIL_TEST_ENABLE_BIT; + +      if (brw->attribs.DepthStencil->stencil[0].writemask || +	  brw->attribs.DepthStencil->stencil[1].writemask) +	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT; +   } + +   /* XXX: when should this be disabled? 
+    */ +   if (1) +      lookup |= IZ_EARLY_DEPTH_TEST_BIT; + + +   line_aa = AA_NEVER; + +   /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */ +   if (brw->attribs.Raster->line_smooth) { +      if (brw->reduced_primitive == PIPE_PRIM_LINES) { +	 line_aa = AA_ALWAYS; +      } +      else if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) { +	 if (brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_LINE) { +	    line_aa = AA_SOMETIMES; + +	    if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE || +		(brw->attribs.Raster->cull_mode == PIPE_WINDING_CW)) +	       line_aa = AA_ALWAYS; +	 } +	 else if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE) { +	    line_aa = AA_SOMETIMES; + +	    if (brw->attribs.Raster->cull_mode == PIPE_WINDING_CCW) +	       line_aa = AA_ALWAYS; +	 } +      } +   } + +   brw_wm_lookup_iz(line_aa, +		    lookup, +		    key); + + +#if 0 +   /* BRW_NEW_SAMPLER  +    * +    * Not doing any of this at the moment: +    */ +   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { +      const struct pipe_sampler_state *unit = brw->attribs.Samplers[i]; + +      if (unit) { + +	 if (unit->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +	    key->shadowtex_mask |= 1<<i; +	 } +	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA) +	    key->yuvtex_mask |= 1<<i; +      } +   } +#endif + + +   /* Extra info: +    */ +   key->program_string_id = fp->id; + +} + + +static void brw_upload_wm_prog( struct brw_context *brw ) +{ +   struct brw_wm_prog_key key; +   struct brw_fragment_program *fp = (struct brw_fragment_program *) +      brw->attribs.FragmentProgram; + +   brw_wm_populate_key(brw, &key); + +   /* Make an early check for the key. +    */ +   if (brw_search_cache(&brw->cache[BRW_WM_PROG], +			&key, sizeof(key), +			&brw->wm.prog_data, +			&brw->wm.prog_gs_offset)) +      return; + +   do_wm_prog(brw, fp, &key); +} + + +const struct brw_tracked_state brw_wm_prog = { +   .dirty = { +      .brw   = (BRW_NEW_FS | +		BRW_NEW_REDUCED_PRIMITIVE), +      .cache = 0 +   }, +   .update = brw_upload_wm_prog +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm.h b/src/gallium/drivers/i965simple/brw_wm.h new file mode 100644 index 0000000000..b29c4393f0 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm.h @@ -0,0 +1,142 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. +  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ +               + +#ifndef BRW_WM_H +#define BRW_WM_H + + +#include "brw_context.h" +#include "brw_eu.h" + +/* A big lookup table is used to figure out which and how many + * additional regs will inserted before the main payload in the WM + * program execution.  These mainly relate to depth and stencil + * processing and the early-depth-test optimization. + */ +#define IZ_PS_KILL_ALPHATEST_BIT    0x1 +#define IZ_PS_COMPUTES_DEPTH_BIT    0x2 +#define IZ_DEPTH_WRITE_ENABLE_BIT   0x4 +#define IZ_DEPTH_TEST_ENABLE_BIT    0x8 +#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10 +#define IZ_STENCIL_TEST_ENABLE_BIT  0x20 +#define IZ_EARLY_DEPTH_TEST_BIT     0x40 +#define IZ_BIT_MAX                  0x80 + +#define AA_NEVER     0 +#define AA_SOMETIMES 1 +#define AA_ALWAYS    2 + +struct brw_wm_prog_key { +   unsigned source_depth_reg:3; +   unsigned aa_dest_stencil_reg:3; +   unsigned dest_depth_reg:3; +   unsigned nr_depth_regs:3; +   unsigned shadowtex_mask:8; +   unsigned computes_depth:1;	/* could be derived from program string */ +   unsigned source_depth_to_render_target:1; +   unsigned runtime_check_aads_emit:1; + +   unsigned yuvtex_mask:8; + +   unsigned program_string_id; +}; + + + + + +#define PROGRAM_INTERNAL_PARAM +#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */ +#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_MAX_ATTRIBS + 3) +#define BRW_WM_MAX_GRF   128		/* hardware limit */ +#define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4) +#define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12) +#define BRW_WM_MAX_PARAM 256 +#define BRW_WM_MAX_CONST 256 +#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS + +#define PAYLOAD_DEPTH     (PIPE_MAX_ATTRIBS) + +#define MAX_IFSN 32 +#define MAX_LOOP_DEPTH 32 + +struct brw_wm_compile { +   struct brw_compile func; +   struct brw_wm_prog_key key; +   struct brw_wm_prog_data prog_data; /* result */ + +   struct brw_fragment_program *fp; + +   unsigned grf_limit; +   unsigned max_wm_grf; + + +   struct brw_reg pixel_xy[2]; +   struct brw_reg delta_xy[2]; +   struct brw_reg pixel_w; + + +   struct brw_reg wm_regs[8][32][4]; + +   struct brw_reg payload_depth[4]; +   struct brw_reg payload_coef[16]; + +   struct brw_reg emit_mask_reg; + +   struct brw_instruction *if_inst[MAX_IFSN]; +   int if_insn; + +   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH]; +   int loop_insn; + +   struct brw_instruction *inst0; +   struct brw_instruction *inst1; + +   struct brw_reg stack; +   struct brw_indirect stack_index; + +   unsigned reg_index; + +   unsigned tmp_start; +   unsigned tmp_index; +}; + + + +void brw_wm_lookup_iz( unsigned line_aa, +		       unsigned lookup, +		       struct brw_wm_prog_key *key ); + +void brw_wm_glsl_emit(struct brw_wm_compile *c); +void brw_wm_emit_decls(struct brw_wm_compile *c); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c new file mode 100644 index 0000000000..d50e66f613 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_decl.c @@ -0,0 +1,392 @@ + +#include "brw_context.h" +#include "brw_eu.h" 
+#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + +static struct brw_reg alloc_tmp(struct brw_wm_compile *c) +{ +   c->tmp_index++; +   c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index); +   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0); +} + +static void release_tmps(struct brw_wm_compile *c) +{ +   c->tmp_index = 0; +} + + + +static int is_null( struct brw_reg reg ) +{ +   return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && +	   reg.nr == BRW_ARF_NULL); +} + +static void emit_pixel_xy( struct brw_wm_compile *c ) +{ +   if (is_null(c->pixel_xy[0])) { + +      struct brw_compile *p = &c->func; +      struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); + +      c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW)); +      c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW)); + +      /* Calculate pixel centers by adding 1 or 0 to each of the +       * micro-tile coordinates passed in r1. +       */ +      brw_ADD(p, +	      c->pixel_xy[0], +	      stride(suboffset(r1_uw, 4), 2, 4, 0), +	      brw_imm_v(0x10101010)); + +      brw_ADD(p, +	      c->pixel_xy[1], +	      stride(suboffset(r1_uw, 5), 2, 4, 0), +	      brw_imm_v(0x11001100)); +   } +} + + + + + + +static void emit_delta_xy( struct brw_wm_compile *c ) +{ +   if (is_null(c->delta_xy[0])) { +      struct brw_compile *p = &c->func; +      struct brw_reg r1 = brw_vec1_grf(1, 0); + +      emit_pixel_xy(c); + +      c->delta_xy[0] = alloc_tmp(c); +      c->delta_xy[1] = alloc_tmp(c); + +      /* Calc delta X,Y by subtracting origin in r1 from the pixel +       * centers. +       */ +      brw_ADD(p, +	      c->delta_xy[0], +	      retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW), +	      negate(r1)); + +      brw_ADD(p, +	      c->delta_xy[1], +	      retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW), +	      negate(suboffset(r1,1))); +   } +} + + + +#if 0 +static void emit_pixel_w( struct brw_wm_compile *c ) +{ +   if (is_null(c->pixel_w)) { +      struct brw_compile *p = &c->func; + +      struct brw_reg interp_wpos = c->coef_wpos; +       +      c->pixel_w = alloc_tmp(c); + +      emit_delta_xy(c); + +      /* Calc 1/w - just linterp wpos[3] optimized by putting the +       * result straight into a message reg. 
+       */ +      struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4); +      brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]); +      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]); + +      /* Calc w */ +      brw_math_16( p,  +		   c->pixel_w, +		   BRW_MATH_FUNCTION_INV, +		   BRW_MATH_SATURATE_NONE, +		   2,  +		   brw_null_reg(), +		   BRW_MATH_PRECISION_FULL); +   } +} +#endif + + +static void emit_cinterp(struct brw_wm_compile *c, +			 int idx, +			 int mask ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg interp[4]; +   struct brw_reg coef = c->payload_coef[idx]; +   int i; + +   interp[0] = brw_vec1_grf(coef.nr, 0); +   interp[1] = brw_vec1_grf(coef.nr, 4); +   interp[2] = brw_vec1_grf(coef.nr+1, 0); +   interp[3] = brw_vec1_grf(coef.nr+1, 4); + +   for(i = 0; i < 4; i++ ) { +      if (mask & (1<<i)) { +	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i]; +	 brw_MOV(p, dst, suboffset(interp[i],3)); +      } +   } +} + +static void emit_linterp(struct brw_wm_compile *c, +			 int idx, +			 int mask ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg interp[4]; +   struct brw_reg coef = c->payload_coef[idx]; +   int i; + +   emit_delta_xy(c); + +   interp[0] = brw_vec1_grf(coef.nr, 0); +   interp[1] = brw_vec1_grf(coef.nr, 4); +   interp[2] = brw_vec1_grf(coef.nr+1, 0); +   interp[3] = brw_vec1_grf(coef.nr+1, 4); + +   for(i = 0; i < 4; i++ ) { +      if (mask & (1<<i)) { +	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i]; +	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]); +	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]); +      } +   } +} + +#if 0 +static void emit_pinterp(struct brw_wm_compile *c, +			 int idx, +			 int mask ) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg interp[4]; +   struct brw_reg coef = c->payload_coef[idx]; +   int i; + +   get_delta_xy(c); +   get_pixel_w(c); + +   interp[0] = brw_vec1_grf(coef.nr, 0); +   interp[1] = brw_vec1_grf(coef.nr, 4); +   interp[2] = brw_vec1_grf(coef.nr+1, 0); +   interp[3] = brw_vec1_grf(coef.nr+1, 4); + +   for(i = 0; i < 4; i++ ) { +      if (mask & (1<<i)) { +	 struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i); +	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]); +	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]); +	 brw_MUL(p, dst, dst, c->pixel_w); +      } +   } +} +#endif + + + +#if 0 +static void emit_wpos( ) +{  +   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx); +   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx); +   struct tgsi_full_src_register deltas = get_delta_xy(c); +   struct tgsi_full_src_register arg2; +   unsigned opcode; + +   opcode = WM_LINTERP; +   arg2 = src_undef(); + +   /* Have to treat wpos.xy specially: +    */ +   emit_op(c, +	   WM_WPOSXY, +	   dst_mask(dst, WRITEMASK_XY), +	   0, 0, 0, +	   get_pixel_xy(c), +	   src_undef(), +	   src_undef()); +       +   dst = dst_mask(dst, WRITEMASK_ZW); + +   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw +    */ +   emit_op(c, +	   WM_LINTERP, +	   dst, +	   0, 0, 0, +	   interp, +	   deltas, +	   arg2); +} +#endif + + + + +/* Perform register allocation: + *  + *  -- r0??? + *  -- passthrough depth regs (and stencil/aa??) + *  -- curbe ?? + *  -- inputs (coefficients) + * + * Use a totally static register allocation.  This will perform poorly + * but is an easy way to get started (again). 
+ */ +static void prealloc_reg(struct brw_wm_compile *c) +{ +   int i, j; +   int nr_curbe_regs = 0; + +   /* R0, then some depth related regs: +    */ +   for (i = 0; i < c->key.nr_depth_regs; i++) { +      c->payload_depth[i] =  brw_vec8_grf(i*2, 0); +      c->reg_index += 2; +   } + + +   /* Then a copy of our part of the CURBE entry: +    */ +   { +      int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1; +      int index = 0; + +      /* XXX number of constants, or highest numbered constant? */ +      assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]); + +      c->prog_data.max_const = 4*nr_constants; +      for (i = 0; i < nr_constants; i++) { +	 for (j = 0; j < 4; j++, index++)  +	    c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8, +								index%8); +      } + +      nr_curbe_regs = 2*((4*nr_constants+15)/16); +      c->reg_index += nr_curbe_regs; +   } + +   /* Adjust for parameter coefficients for position, which are +    * currently always provided. +    */ +//   c->position_coef[i] = brw_vec8_grf(c->reg_index, 0); +   c->reg_index += 2; + +   /* Next we receive the plane coefficients for parameter +    * interpolation: +    */ +   assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs); +   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) { +      c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0); +      c->reg_index += 2; +   } + +   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2; +   c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2; +   c->prog_data.curb_read_length = nr_curbe_regs; + +   /* That's the end of the payload, now we can start allocating registers. +    */ +   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); +   c->reg_index++; + +   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); +   c->reg_index += 2; + +   /* Now allocate room for the interpolated inputs and staging +    * registers for the outputs: +    */ +   /* XXX do we want to loop over the _number_ of inputs/outputs or loop +    * to the highest input/output index that's used? +    *  Probably the same, actually. +    */ +   assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs); +   assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs); +   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++)  +      for (j = 0; j < 4; j++) +	 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 ); + +   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++)  +      for (j = 0; j < 4; j++) +	 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 ); + +   /* Beyond this we should only need registers for internal temporaries: +    */ +   c->tmp_start = c->reg_index; +} + + + + + +/* Need to interpolate fragment program inputs in as a preamble to the + * shader.  
A more sophisticated compiler would do this on demand, but + * we'll do it up front: + */ +void brw_wm_emit_decls(struct brw_wm_compile *c) +{ +   struct tgsi_parse_context parse; +   int done = 0; + +   prealloc_reg(c); + +   tgsi_parse_init( &parse, c->fp->program.tokens ); + +   while( !done && +	  !tgsi_parse_end_of_tokens( &parse ) )  +   { +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +      { +	 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; +	 unsigned first = decl->DeclarationRange.First; +	 unsigned last = decl->DeclarationRange.Last; +	 unsigned mask = decl->Declaration.UsageMask; /* ? */ +	 unsigned i; + +	 if (decl->Declaration.File != TGSI_FILE_INPUT) +	    break; + +	 for( i = first; i <= last; i++ ) { +	    switch (decl->Declaration.Interpolate) { +	    case TGSI_INTERPOLATE_CONSTANT: +	       emit_cinterp(c, i, mask); +	       break; + +	    case TGSI_INTERPOLATE_LINEAR: +	       emit_linterp(c, i, mask); +	       break; + +	    case TGSI_INTERPOLATE_PERSPECTIVE: +	       //emit_pinterp(c, i, mask); +	       emit_linterp(c, i, mask); +	       break; +	    } +	 } +	 break; +      } +      case TGSI_TOKEN_TYPE_IMMEDIATE: +      case TGSI_TOKEN_TYPE_INSTRUCTION: +      default: +         done = 1; +	 break; +      } +   } + +   tgsi_parse_free (&parse); +    +   release_tmps(c); +} diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c new file mode 100644 index 0000000000..ab6410aa60 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c @@ -0,0 +1,1076 @@ + +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + + + +static int get_scalar_dst_index(struct tgsi_full_instruction *inst) +{ +   struct tgsi_dst_register dst = inst->FullDstRegisters[0].DstRegister; +   int i; +   for (i = 0; i < 4; i++) +      if (dst.WriteMask & (1<<i)) +	 break; +   return i; +} + +static struct brw_reg alloc_tmp(struct brw_wm_compile *c) +{ +   c->tmp_index++; +   c->reg_index = MAX2(c->reg_index, c->tmp_index); +   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0); +} + +static void release_tmps(struct brw_wm_compile *c) +{ +   c->tmp_index = 0; +} + + +static struct brw_reg +get_reg(struct brw_wm_compile *c, int file, int index, int component ) +{ +   switch (file) { +   case TGSI_FILE_NULL: +      return brw_null_reg(); + +   case TGSI_FILE_SAMPLER: +      /* Should never get here: +       */ +      assert (0);	        +      return brw_null_reg(); + +   case TGSI_FILE_IMMEDIATE: +      /* These need a different path: +       */ +      assert(0); +      return brw_null_reg(); + +        +   case TGSI_FILE_CONSTANT: +   case TGSI_FILE_INPUT: +   case TGSI_FILE_OUTPUT: +   case TGSI_FILE_TEMPORARY: +   case TGSI_FILE_ADDRESS: +      return c->wm_regs[file][index][component]; + +   default: +      assert(0); +      return brw_null_reg(); +   } +} + + +static struct brw_reg get_dst_reg(struct brw_wm_compile *c, +				  struct tgsi_full_instruction *inst,  +				  int component) +{ +   return get_reg(c,  +		  inst->FullDstRegisters[0].DstRegister.File,  +		  inst->FullDstRegisters[0].DstRegister.Index, +		  component); +} + +static int get_swz( struct tgsi_src_register src, int index ) +{ +   switch (index & 3) { +   case 0: return src.SwizzleX; +   case 1: return src.SwizzleY; +   case 2: return 
src.SwizzleZ; +   case 3: return src.SwizzleW; +   default: return 0; +   } +} + +static int get_ext_swz( struct tgsi_src_register_ext_swz src, int index ) +{ +   switch (index & 3) { +   case 0: return src.ExtSwizzleX; +   case 1: return src.ExtSwizzleY; +   case 2: return src.ExtSwizzleZ; +   case 3: return src.ExtSwizzleW; +   default: return 0; +   } +} + +static struct brw_reg get_src_reg(struct brw_wm_compile *c, +				  struct tgsi_full_src_register *src,  +				  int index) +{ +   struct brw_reg reg; +   int component = index; +   int neg = 0; +   int abs = 0; + +   if (src->SrcRegister.Negate) +      neg = 1; + +   component = get_swz(src->SrcRegister, component); + +   /* Yes, there are multiple negates: +    */ +   switch (component & 3) { +   case 0: neg ^= src->SrcRegisterExtSwz.NegateX; break; +   case 1: neg ^= src->SrcRegisterExtSwz.NegateY; break; +   case 2: neg ^= src->SrcRegisterExtSwz.NegateZ; break; +   case 3: neg ^= src->SrcRegisterExtSwz.NegateW; break; +   } + +   /* And multiple swizzles, fun isn't it: +    */ +   component = get_ext_swz(src->SrcRegisterExtSwz, component); + +   /* Not handling indirect lookups yet: +    */ +   assert(src->SrcRegister.Indirect == 0); + +   /* Don't know what dimension means: +    */ +   assert(src->SrcRegister.Dimension == 0); + +   /* Will never handle any of this stuff:  +    */ +   assert(src->SrcRegisterExtMod.Complement == 0); +   assert(src->SrcRegisterExtMod.Bias == 0); +   assert(src->SrcRegisterExtMod.Scale2X == 0); + +   if (src->SrcRegisterExtMod.Absolute) +      abs = 1; + +   /* Another negate!  This is a post-absolute negate, which we +    * can't do.  Need to clean the crap out of tgsi somehow. +    */ +   assert(src->SrcRegisterExtMod.Negate == 0); + +   switch( component ) { +   case TGSI_EXTSWIZZLE_X: +   case TGSI_EXTSWIZZLE_Y: +   case TGSI_EXTSWIZZLE_Z: +   case TGSI_EXTSWIZZLE_W: +      reg = get_reg(c,  +		    src->SrcRegister.File,  +		    src->SrcRegister.Index,  +		    component ); + +      if (neg)  +	 reg = negate(reg); +    +      if (abs) +	 reg = brw_abs(reg); + +      break; + +      /* XXX: this won't really work in the general case, but we know +       * that the extended swizzle is only allowed in the SWZ +       * instruction (right??), in which case using an immediate +       * directly will work. 
+       */ +   case TGSI_EXTSWIZZLE_ZERO: +      reg = brw_imm_f(0); +      break; + +   case TGSI_EXTSWIZZLE_ONE: +      if (neg && !abs) +	 reg = brw_imm_f(-1.0); +      else +	 reg = brw_imm_f(1.0); +      break; + +   default: +      assert(0); +      break; +   } + +     +   return reg; +} + +static void emit_abs( struct brw_wm_compile *c, +		      struct tgsi_full_instruction *inst) +{ +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + +   int i; +   struct brw_compile *p = &c->func; +   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 struct brw_reg src, dst; +	 dst = get_dst_reg(c, inst, i); +	 src = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 brw_MOV(p, dst, brw_abs(src)); /* NOTE */ +      } +   } +   brw_set_saturate(p, 0); +} + + +static void emit_xpd(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   int i; +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   for (i = 0; i < 4; i++) { +      unsigned i2 = (i+2)%3; +      unsigned i1 = (i+1)%3; +      if (mask & (1<<i)) { +	 struct brw_reg src0, src1, dst; +	 dst = get_dst_reg(c, inst, i); +	 src0 = negate(get_src_reg(c, &inst->FullSrcRegisters[0], i2)); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i1); +	 brw_MUL(p, brw_null_reg(), src0, src1); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i1); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i2); +	 brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); +	 brw_MAC(p, dst, src0, src1); +	 brw_set_saturate(p, 0); +      } +   } +   brw_set_saturate(p, 0); +} + +static void emit_dp3(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_reg src0[3], src1[3], dst; +   int i; +   struct brw_compile *p = &c->func; +   for (i = 0; i < 3; i++) { +      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); +      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); +   } + +   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); +   brw_MUL(p, brw_null_reg(), src0[0], src1[0]); +   brw_MAC(p, brw_null_reg(), src0[1], src1[1]); +   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +   brw_MAC(p, dst, src0[2], src1[2]); +   brw_set_saturate(p, 0); +} + +static void emit_dp4(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_reg src0[4], src1[4], dst; +   int i; +   struct brw_compile *p = &c->func; +   for (i = 0; i < 4; i++) { +      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); +      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); +   } +   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); +   brw_MUL(p, brw_null_reg(), src0[0], src1[0]); +   brw_MAC(p, brw_null_reg(), src0[1], src1[1]); +   brw_MAC(p, brw_null_reg(), src0[2], src1[2]); +   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); +   brw_MAC(p, dst, src0[3], src1[3]); +   brw_set_saturate(p, 0); +} + +static void emit_dph(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_reg src0[4], src1[4], dst; +   int i; +   struct brw_compile *p = &c->func; +   for (i = 0; i < 4; i++) { +      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); +      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); +   } +   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); +   brw_MUL(p, brw_null_reg(), src0[0], src1[0]); +   brw_MAC(p, brw_null_reg(), src0[1], src1[1]); +   brw_MAC(p, dst, src0[2], src1[2]); +   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +   brw_ADD(p, dst, src0[3], src1[3]); +   brw_set_saturate(p, 0); +} + +static void emit_math1(struct brw_wm_compile *c, +		       struct tgsi_full_instruction *inst, unsigned func) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg src0, dst; + +   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); +   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); +   brw_MOV(p, brw_message_reg(2), src0); +   brw_math(p, +	    dst, +	    func, +	    ((inst->Instruction.Saturate != TGSI_SAT_NONE)  +	     ? BRW_MATH_SATURATE_SATURATE  +	     : BRW_MATH_SATURATE_NONE), +	    2, +	    brw_null_reg(), +	    BRW_MATH_DATA_VECTOR, +	    BRW_MATH_PRECISION_FULL); +} + + +static void emit_alu2(struct brw_wm_compile *c,		       +		      struct tgsi_full_instruction *inst, +		      unsigned opcode) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg src0, src1, dst; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   int i; +   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +   for (i = 0 ; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); +	 brw_alu2(p, opcode, dst, src0, src1); +      } +   } +   brw_set_saturate(p, 0); +} + + +static void emit_alu1(struct brw_wm_compile *c, +		      struct tgsi_full_instruction *inst, +		      unsigned opcode) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg src0, dst; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   int i; +   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +   for (i = 0 ; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 brw_alu1(p, opcode, dst, src0); +      } +   } +   if (inst->Instruction.Saturate != TGSI_SAT_NONE) +      brw_set_saturate(p, 0); +} + + +static void emit_max(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg src0, src1, dst; +   int i; +   brw_push_insn_state(p); +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +	 brw_MOV(p, dst, src0); +	 brw_set_saturate(p, 0); + +	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1); +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); +	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); +	 brw_MOV(p, dst, src1); +	 brw_set_saturate(p, 0); +	 brw_set_predicate_control_flag_value(p, 0xff); +      } +   } +   brw_pop_insn_state(p); +} + +static void emit_min(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg src0, src1, dst; +   int i; +   brw_push_insn_state(p); +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +	 brw_MOV(p, dst, src0); +	 brw_set_saturate(p, 0); + +	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0); +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); +	 brw_MOV(p, dst, src1); +	 brw_set_saturate(p, 0); +	 brw_set_predicate_control_flag_value(p, 0xff); +      } +   } +   brw_pop_insn_state(p); +} + +static void emit_pow(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg dst, src0, src1; +   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); +   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); +   src1 = get_src_reg(c, &inst->FullSrcRegisters[1], 0); + +   brw_MOV(p, brw_message_reg(2), src0); +   brw_MOV(p, brw_message_reg(3), src1); + +   brw_math(p, +	    dst, +	    BRW_MATH_FUNCTION_POW, +	    (inst->Instruction.Saturate != TGSI_SAT_NONE  +	     ? BRW_MATH_SATURATE_SATURATE  +	     : BRW_MATH_SATURATE_NONE), +	    2, +	    brw_null_reg(), +	    BRW_MATH_DATA_VECTOR, +	    BRW_MATH_PRECISION_FULL); +} + +static void emit_lrp(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg dst, tmp1, tmp2, src0, src1, src2; +   int i; +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + +	 if (src1.nr == dst.nr) { +	    tmp1 = alloc_tmp(c); +	    brw_MOV(p, tmp1, src1); +	 } else +	    tmp1 = src1; + +	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i); +	 if (src2.nr == dst.nr) { +	    tmp2 = alloc_tmp(c); +	    brw_MOV(p, tmp2, src2); +	 } else +	    tmp2 = src2; + +	 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0)); +	 brw_MUL(p, brw_null_reg(), dst, tmp2); +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); +	 brw_MAC(p, dst, src0, tmp1); +	 brw_set_saturate(p, 0); +      } +      release_tmps(c); +   } +} + +static void emit_kil(struct brw_wm_compile *c) +{ +   struct brw_compile *p = &c->func; +   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); +   brw_push_insn_state(p); +   brw_set_mask_control(p, BRW_MASK_DISABLE); +   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK +   brw_AND(p, depth, c->emit_mask_reg, depth); +   brw_pop_insn_state(p); +} + +static void emit_mad(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg dst, src0, src1, src2; +   int i; + +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); +	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i); +	 brw_MUL(p, dst, src0, src1); + +	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); +	 brw_ADD(p, dst, dst, src2); +	 brw_set_saturate(p, 0); +      } +   } +} + +static void emit_sop(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst, unsigned cond) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg dst, src0, src1; +   int i; + +   brw_push_insn_state(p); +   for (i = 0; i < 4; i++) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); +	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); +	 brw_CMP(p, brw_null_reg(), cond, src0, src1); +	 brw_set_predicate_control(p, BRW_PREDICATE_NONE); +	 brw_MOV(p, dst, brw_imm_f(0.0)); +	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); +	 brw_MOV(p, dst, brw_imm_f(1.0)); +      } +   } +   brw_pop_insn_state(p); +} + + +static void emit_ddx(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg interp[4]; +   struct brw_reg dst; +   struct brw_reg src0, w; +   unsigned nr, i; +   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); +   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); +   nr = src0.nr; +   interp[0] = brw_vec1_grf(nr, 0); +   interp[1] = brw_vec1_grf(nr, 4); +   interp[2] = brw_vec1_grf(nr+1, 0); +   interp[3] = brw_vec1_grf(nr+1, 4); +   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); +   for(i = 0; i < 4; i++ ) { +      if (mask & (1<<i)) { +	 dst = get_dst_reg(c, inst, i); +	 brw_MOV(p, dst, interp[i]); +	 brw_MUL(p, dst, dst, w); +      } +   } +   brw_set_saturate(p, 0); +} + +static void emit_ddy(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +   struct brw_reg interp[4]; +   struct brw_reg dst; +   struct brw_reg src0, w; +   unsigned nr, i; + +   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); +   nr = src0.nr; +   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); +   interp[0] = brw_vec1_grf(nr, 0); +   interp[1] = brw_vec1_grf(nr, 4); +   interp[2] = brw_vec1_grf(nr+1, 0); +   interp[3] = brw_vec1_grf(nr+1, 4); +   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); +   for(i = 0; i < 4; i++ ) { +      if (mask & (1<<i)) { +	
 dst = get_dst_reg(c, inst, i); +	 brw_MOV(p, dst, suboffset(interp[i], 1)); +	 brw_MUL(p, dst, dst, w); +      } +   } +   brw_set_saturate(p, 0); +} + +/* TODO +   BIAS on SIMD8 not working yet... +*/ +static void emit_txb(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +#if 0 +   struct brw_compile *p = &c->func; +   struct brw_reg payload_reg = c->payload_depth[0]; +   struct brw_reg dst[4], src[4]; +   unsigned i; +   for (i = 0; i < 4; i++) +      dst[i] = get_dst_reg(c, inst, i); +   for (i = 0; i < 4; i++) +      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + +#if 0 +   switch (inst->TexSrcTarget) { +   case TEXTURE_1D_INDEX: +      brw_MOV(p, brw_message_reg(2), src[0]); +      brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); +      brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); +      break; +   case TEXTURE_2D_INDEX: +   case TEXTURE_RECT_INDEX: +      brw_MOV(p, brw_message_reg(2), src[0]); +      brw_MOV(p, brw_message_reg(3), src[1]); +      brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); +      break; +   default: +      brw_MOV(p, brw_message_reg(2), src[0]); +      brw_MOV(p, brw_message_reg(3), src[1]); +      brw_MOV(p, brw_message_reg(4), src[2]); +      break; +   } +#else +   brw_MOV(p, brw_message_reg(2), src[0]); +   brw_MOV(p, brw_message_reg(3), src[1]); +   brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); +#endif + +   brw_MOV(p, brw_message_reg(5), src[3]); +   brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); +   brw_SAMPLE(p, +	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), +	      1, +	      retype(payload_reg, BRW_REGISTER_TYPE_UW), +	      inst->TexSrcUnit + 1, /* surface */ +	      inst->TexSrcUnit,     /* sampler */ +	      inst->FullDstRegisters[0].DstRegister.WriteMask, +	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, +	      4, +	      4, +	      0); +#endif +} + +static void emit_tex(struct brw_wm_compile *c, +		     struct tgsi_full_instruction *inst) +{ +#if 0 +   struct brw_compile *p = &c->func; +   struct brw_reg payload_reg = c->payload_depth[0]; +   struct brw_reg dst[4], src[4]; +   unsigned msg_len; +   unsigned i, nr; +   unsigned emit; +   boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0; + +   for (i = 0; i < 4; i++) +      dst[i] = get_dst_reg(c, inst, i); +   for (i = 0; i < 4; i++) +      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + +#if 0 +   switch (inst->TexSrcTarget) { +   case TEXTURE_1D_INDEX: +      emit = WRITEMASK_X; +      nr = 1; +      break; +   case TEXTURE_2D_INDEX: +   case TEXTURE_RECT_INDEX: +      emit = WRITEMASK_XY; +      nr = 2; +      break; +   default: +      emit = WRITEMASK_XYZ; +      nr = 3; +      break; +   } +#else +   emit = WRITEMASK_XY; +   nr = 2; +#endif + +   msg_len = 1; + +   for (i = 0; i < nr; i++) { +      static const unsigned swz[4] = {0,1,2,2}; +      if (emit & (1<<i)) +	 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]); +      else +	 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0)); +      msg_len += 1; +   } + +   if (shadow) { +      brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); +      brw_MOV(p, brw_message_reg(6), src[2]); +   } + +   brw_SAMPLE(p, +	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), +	      1, +	      retype(payload_reg, BRW_REGISTER_TYPE_UW), +	      inst->TexSrcUnit + 1, /* surface */ +	      inst->TexSrcUnit,     /* sampler */ +	      inst->FullDstRegisters[0].DstRegister.WriteMask, +	      BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, +	      4, +	      shadow ? 
6 : 4, +	      0); + +   if (shadow) +      brw_MOV(p, dst[3], brw_imm_f(1.0)); +#endif +} + + + + + + + + +static void emit_fb_write(struct brw_wm_compile *c, +			  struct tgsi_full_instruction *inst) +{ +   struct brw_compile *p = &c->func; +   int nr = 2; +   int channel; +   int base_reg = 0; + +   // src0 = output color +   // src1 = payload_depth[0] +   // src2 = output depth +   // dst = ??? + + + +   /* Reserve a space for AA - may not be needed: +    */ +   if (c->key.aa_dest_stencil_reg) +      nr += 1; + +   { +      brw_push_insn_state(p); +      for (channel = 0; channel < 4; channel++) { +	 struct brw_reg src0 = c->wm_regs[TGSI_FILE_OUTPUT][0][channel]; + +	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */ +	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */ +	 brw_MOV(p, brw_message_reg(nr + channel), src0); +      } +      /* skip over the regs populated above: */ +      nr += 8; +      brw_pop_insn_state(p); +   } +     + +   /* Pass through control information: +    */ +   /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */ +   { +      brw_push_insn_state(p); +      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ +      brw_MOV(p, +	      brw_message_reg(base_reg + 1), +	      brw_vec8_grf(1, 0)); +      brw_pop_insn_state(p); +   } + +   /* Send framebuffer write message: */ +   brw_fb_WRITE(p, +		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), +		base_reg, +		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), +		0,              /* render surface always 0 */ +		nr, +		0, +		1); + +} + + +static void brw_wm_emit_instruction( struct brw_wm_compile *c, +				     struct tgsi_full_instruction *inst ) +{ +   struct brw_compile *p = &c->func; + +#if 0    +   if (inst->CondUpdate) +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); +   else +      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); +#else +   brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); +#endif + +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_ABS: +      emit_abs(c, inst); +      break; +   case TGSI_OPCODE_ADD: +      emit_alu2(c, inst, BRW_OPCODE_ADD); +      break; +   case TGSI_OPCODE_SUB: +      assert(0); +//      emit_alu2(c, inst, BRW_OPCODE_SUB); +      break; +   case TGSI_OPCODE_FRC: +      emit_alu1(c, inst, BRW_OPCODE_FRC); +      break; +   case TGSI_OPCODE_FLR: +      assert(0); +//      emit_alu1(c, inst, BRW_OPCODE_FLR); +      break; +   case TGSI_OPCODE_LRP: +      emit_lrp(c, inst); +      break; +   case TGSI_OPCODE_INT: +      emit_alu1(c, inst, BRW_OPCODE_RNDD); +      break; +   case TGSI_OPCODE_MOV: +      emit_alu1(c, inst, BRW_OPCODE_MOV); +      break; +   case TGSI_OPCODE_DP3: +      emit_dp3(c, inst); +      break; +   case TGSI_OPCODE_DP4: +      emit_dp4(c, inst); +      break; +   case TGSI_OPCODE_XPD: +      emit_xpd(c, inst); +      break; +   case TGSI_OPCODE_DPH: +      emit_dph(c, inst); +      break; +   case TGSI_OPCODE_RCP: +      emit_math1(c, inst, BRW_MATH_FUNCTION_INV); +      break; +   case TGSI_OPCODE_RSQ: +      emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ); +      break; +   case TGSI_OPCODE_SIN: +      emit_math1(c, inst, BRW_MATH_FUNCTION_SIN); +      break; +   case TGSI_OPCODE_COS: +      emit_math1(c, inst, BRW_MATH_FUNCTION_COS); +      break; +   case TGSI_OPCODE_EX2: +      emit_math1(c, inst, BRW_MATH_FUNCTION_EXP); +      break; +   case TGSI_OPCODE_LG2: +      emit_math1(c, inst, BRW_MATH_FUNCTION_LOG); +      break; +   case TGSI_OPCODE_MAX: +      emit_max(c, inst); +      break; +   case 
TGSI_OPCODE_MIN: +      emit_min(c, inst); +      break; +   case TGSI_OPCODE_DDX: +      emit_ddx(c, inst); +      break; +   case TGSI_OPCODE_DDY: +      emit_ddy(c, inst); +      break; +   case TGSI_OPCODE_SLT: +      emit_sop(c, inst, BRW_CONDITIONAL_L); +      break; +   case TGSI_OPCODE_SLE: +      emit_sop(c, inst, BRW_CONDITIONAL_LE); +      break; +   case TGSI_OPCODE_SGT: +      emit_sop(c, inst, BRW_CONDITIONAL_G); +      break; +   case TGSI_OPCODE_SGE: +      emit_sop(c, inst, BRW_CONDITIONAL_GE); +      break; +   case TGSI_OPCODE_SEQ: +      emit_sop(c, inst, BRW_CONDITIONAL_EQ); +      break; +   case TGSI_OPCODE_SNE: +      emit_sop(c, inst, BRW_CONDITIONAL_NEQ); +      break; +   case TGSI_OPCODE_MUL: +      emit_alu2(c, inst, BRW_OPCODE_MUL); +      break; +   case TGSI_OPCODE_POW: +      emit_pow(c, inst); +      break; +   case TGSI_OPCODE_MAD: +      emit_mad(c, inst); +      break; +   case TGSI_OPCODE_TEX: +      emit_tex(c, inst); +      break; +   case TGSI_OPCODE_TXB: +      emit_txb(c, inst); +      break; +   case TGSI_OPCODE_TEXKILL: +      emit_kil(c); +      break; +   case TGSI_OPCODE_IF: +      assert(c->if_insn < MAX_IFSN); +      c->if_inst[c->if_insn++] = brw_IF(p, BRW_EXECUTE_8); +      break; +   case TGSI_OPCODE_ELSE: +      c->if_inst[c->if_insn-1]  = brw_ELSE(p, c->if_inst[c->if_insn-1]); +      break; +   case TGSI_OPCODE_ENDIF: +      assert(c->if_insn > 0); +      brw_ENDIF(p, c->if_inst[--c->if_insn]); +      break; +   case TGSI_OPCODE_BGNSUB: +   case TGSI_OPCODE_ENDSUB: +      break; +   case TGSI_OPCODE_CAL: +      brw_push_insn_state(p); +      brw_set_mask_control(p, BRW_MASK_DISABLE); +      brw_set_access_mode(p, BRW_ALIGN_1); +      brw_ADD(p, deref_1ud(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); +      brw_set_access_mode(p, BRW_ALIGN_16); +      brw_ADD(p,  +	      get_addr_reg(c->stack_index), +	      get_addr_reg(c->stack_index), brw_imm_d(4)); +//      orig_inst = inst->Data; +//      orig_inst->Data = &p->store[p->nr_insn]; +      assert(0); +      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); +      brw_pop_insn_state(p); +      break; + +   case TGSI_OPCODE_RET: +#if 0 +      brw_push_insn_state(p); +      brw_set_mask_control(p, BRW_MASK_DISABLE); +      brw_ADD(p,  +	      get_addr_reg(c->stack_index), +	      get_addr_reg(c->stack_index), brw_imm_d(-4)); +      brw_set_access_mode(p, BRW_ALIGN_1); +      brw_MOV(p, brw_ip_reg(), deref_1ud(c->stack_index, 0)); +      brw_set_access_mode(p, BRW_ALIGN_16); +      brw_pop_insn_state(p); +#else +      emit_fb_write(c, inst); +#endif + +      break; +   case TGSI_OPCODE_LOOP: +      c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8); +      break; +   case TGSI_OPCODE_BRK: +      brw_BREAK(p); +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); +      break; +   case TGSI_OPCODE_CONT: +      brw_CONT(p); +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); +      break; +   case TGSI_OPCODE_ENDLOOP: +      c->loop_insn--; +      c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]); +      /* patch all the BREAK instructions from +	 last BEGINLOOP */ +      while (c->inst0 > c->loop_inst[c->loop_insn]) { +	 c->inst0--; +	 if (c->inst0->header.opcode == BRW_OPCODE_BREAK) { +	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0 + 1; +	    c->inst0->bits3.if_else.pop_count = 0; +	 } else if (c->inst0->header.opcode == BRW_OPCODE_CONTINUE) { +	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0; +	    
c->inst0->bits3.if_else.pop_count = 0; +	 } +      } +      break; +   case TGSI_OPCODE_END: +      emit_fb_write(c, inst); +      break; + +   default: +      debug_printf("unsupported IR in fragment shader %d\n", +		   inst->Instruction.Opcode); +   } +#if 0 +   if (inst->CondUpdate) +      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); +   else +      brw_set_predicate_control(p, BRW_PREDICATE_NONE); +#endif +} + + + + + + +void brw_wm_glsl_emit(struct brw_wm_compile *c) +{ +   struct tgsi_parse_context parse; +   struct brw_compile *p = &c->func; + +   brw_init_compile(&c->func); +   brw_set_compression_control(p, BRW_COMPRESSION_NONE); + +   c->reg_index = 0; +   c->if_insn = 0; +   c->loop_insn = 0; +   c->stack_index = brw_indirect(0,0); + +   /* Do static register allocation and parameter interpolation: +    */ +   brw_wm_emit_decls( c ); + +   /* Emit the actual program.  All done with very direct translation, +    * hopefully we can improve on this shortly... +    */ +   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack)); + +   tgsi_parse_init( &parse, c->fp->program.tokens ); + +   while( !tgsi_parse_end_of_tokens( &parse ) )  +   { +      tgsi_parse_token( &parse ); + +      switch( parse.FullToken.Token.Type ) { +      case TGSI_TOKEN_TYPE_DECLARATION: +	 /* already done */ +	 break; + +      case TGSI_TOKEN_TYPE_IMMEDIATE: +         /* not handled yet */ +	 assert(0); +         break; + +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         brw_wm_emit_instruction(c, &parse.FullToken.FullInstruction); +         break; + +      default: +         assert( 0 ); +      } +   } + +   tgsi_parse_free (&parse); +    +   /* Fix up call targets: +    */ +#if 0 +   { +      unsigned nr_insns = c->fp->program.Base.NumInstructions; +      unsigned insn, target_insn; +      struct tgsi_full_instruction *inst1, *inst2; +      struct brw_instruction *brw_inst1, *brw_inst2; +      int offset; +      for (insn = 0; insn < nr_insns; insn++) { +	 inst1 = &c->fp->program.Base.Instructions[insn]; +	 brw_inst1 = inst1->Data; +	 switch (inst1->Opcode) { +	 case TGSI_OPCODE_CAL: +	    target_insn = inst1->BranchTarget; +	    inst2 = &c->fp->program.Base.Instructions[target_insn]; +	    brw_inst2 = inst2->Data; +	    offset = brw_inst2 - brw_inst1; +	    brw_set_src1(brw_inst1, brw_imm_d(offset*16)); +	    break; +	 default: +	    break; +	 } +      } +   } +#endif + +   c->prog_data.total_grf = c->reg_index; +   c->prog_data.total_scratch = 0; +} diff --git a/src/gallium/drivers/i965simple/brw_wm_iz.c b/src/gallium/drivers/i965simple/brw_wm_iz.c new file mode 100644 index 0000000000..6c5f25bf39 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_iz.c @@ -0,0 +1,214 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. +  + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: +  + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+  + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +  + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_wm.h" + + +#undef P			/* prompted depth */ +#undef C			/* computed */ +#undef N			/* non-promoted? */ + +#define P 0 +#define C 1 +#define N 2 + +const struct { +   unsigned mode:2; +   unsigned sd_present:1; +   unsigned sd_to_rt:1; +   unsigned dd_present:1; +   unsigned ds_present:1; +} wm_iz_table[IZ_BIT_MAX] = +{ + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 1, 1, 0, 0 },  + { C, 1, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 1, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 1, 1, 0, 0 },  + { C, 1, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 0, 1, 0, 0 },  + { C, 1, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 1, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 0, 0, 1 },  + { C, 0, 0, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 1, 1, 0, 1 },  + { C, 1, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 1, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 0, 0, 1 },  + { C, 0, 0, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 1, 1, 0, 1 },  + { C, 1, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 1, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 0 },  + { N, 0, 1, 0, 0 },  + { N, 0, 1, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 0 },  + { N, 0, 1, 0, 0 },  + { N, 0, 1, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 0 },  + { C, 0, 1, 1, 0 },  + { C, 0, 1, 1, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 1 },  + { N, 0, 1, 0, 1 },  + { N, 0, 1, 0, 1 },  + { P, 0, 0, 0, 0 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { P, 0, 0, 0, 0 },  + { N, 1, 1, 0, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { P, 0, 0, 0, 0 },  + { C, 0, 0, 0, 1 },  + { P, 0, 0, 0, 0 },  + { C, 0, 1, 0, 1 },  + { P, 0, 0, 0, 0 },  + 
{ C, 1, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { C, 0, 1, 0, 1 },  + { P, 0, 0, 0, 0 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { P, 0, 0, 0, 0 },  + { C, 1, 1, 1, 1 },  + { C, 0, 1, 1, 1 },  + { C, 0, 1, 1, 1 }  +}; + +void brw_wm_lookup_iz( unsigned line_aa, +		       unsigned lookup, +		       struct brw_wm_prog_key *key ) +{ +   unsigned reg = 2; + +   assert (lookup < IZ_BIT_MAX); +       +   if (lookup & IZ_PS_COMPUTES_DEPTH_BIT) +      key->computes_depth = 1; + +   if (wm_iz_table[lookup].sd_present) { +      key->source_depth_reg = reg; +      reg += 2; +   } + +   if (wm_iz_table[lookup].sd_to_rt) +      key->source_depth_to_render_target = 1; + +   if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) { +      key->aa_dest_stencil_reg = reg; +      key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present && +				      line_aa == AA_SOMETIMES); +      reg++; +   } + +   if (wm_iz_table[lookup].dd_present) { +      key->dest_depth_reg = reg; +      reg+=2; +   } + +   key->nr_depth_regs = (reg+1)/2; +} + diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c new file mode 100644 index 0000000000..52b2909a65 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c @@ -0,0 +1,275 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + + +#define COMPAREFUNC_ALWAYS		0 +#define COMPAREFUNC_NEVER		0x1 +#define COMPAREFUNC_LESS		0x2 +#define COMPAREFUNC_EQUAL		0x3 +#define COMPAREFUNC_LEQUAL		0x4 +#define COMPAREFUNC_GREATER		0x5 +#define COMPAREFUNC_NOTEQUAL		0x6 +#define COMPAREFUNC_GEQUAL		0x7 + +/* Samplers aren't strictly wm state from the hardware's perspective, + * but that is the only situation in which we use them in this driver. 
+ */ + +static int intel_translate_shadow_compare_func(unsigned func) +{ +   switch(func) { +   case PIPE_FUNC_NEVER: +       return COMPAREFUNC_ALWAYS; +   case PIPE_FUNC_LESS: +       return COMPAREFUNC_LEQUAL; +   case PIPE_FUNC_LEQUAL: +       return COMPAREFUNC_LESS; +   case PIPE_FUNC_GREATER: +       return COMPAREFUNC_GEQUAL; +   case PIPE_FUNC_GEQUAL: +      return COMPAREFUNC_GREATER; +   case PIPE_FUNC_NOTEQUAL: +      return COMPAREFUNC_EQUAL; +   case PIPE_FUNC_EQUAL: +      return COMPAREFUNC_NOTEQUAL; +   case PIPE_FUNC_ALWAYS: +       return COMPAREFUNC_NEVER; +   } + +   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func); +   return COMPAREFUNC_NEVER; +} + +/* The brw (and related graphics cores) do not support GL_CLAMP.  The + * Intel drivers for "other operating systems" implement GL_CLAMP as + * GL_CLAMP_TO_EDGE, so the same is done here. + */ +static unsigned translate_wrap_mode( int wrap ) +{ +   switch( wrap ) { +   case PIPE_TEX_WRAP_REPEAT: +      return BRW_TEXCOORDMODE_WRAP; +   case PIPE_TEX_WRAP_CLAMP: +      return BRW_TEXCOORDMODE_CLAMP; +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */ +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      return BRW_TEXCOORDMODE_CLAMP_BORDER; +   case PIPE_TEX_WRAP_MIRROR_REPEAT: +      return BRW_TEXCOORDMODE_MIRROR; +   default: +      return BRW_TEXCOORDMODE_WRAP; +   } +} + + +static unsigned U_FIXED(float value, unsigned frac_bits) +{ +   value *= (1<<frac_bits); +   return value < 0 ? 0 : value; +} + +static int S_FIXED(float value, unsigned frac_bits) +{ +   return value * (1<<frac_bits); +} + + +static unsigned upload_default_color( struct brw_context *brw, +                                      const float *color ) +{ +   struct brw_sampler_default_color sdc; + +   COPY_4V(sdc.color, color); + +   return brw_cache_data( &brw->cache[BRW_SAMPLER_DEFAULT_COLOR], &sdc ); +} + + +/* + */ +static void brw_update_sampler_state( const struct pipe_sampler_state *pipe_sampler, +				      unsigned sdc_gs_offset, +				      struct brw_sampler_state *sampler) +{ +   memset(sampler, 0, sizeof(*sampler)); + +   switch (pipe_sampler->min_mip_filter) { +   case PIPE_TEX_FILTER_NEAREST: +      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST; +      break; +   case PIPE_TEX_FILTER_LINEAR: +      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR; +      break; +   case PIPE_TEX_FILTER_ANISO: +      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; +      break; +   default: +      break; +   } + +   switch (pipe_sampler->min_mip_filter) { +   case PIPE_TEX_MIPFILTER_NEAREST: +      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST; +      break; +   case PIPE_TEX_MIPFILTER_LINEAR: +      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR; +      break; +   case PIPE_TEX_MIPFILTER_NONE: +      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE; +      break; +   default: +      break; +   } +   /* Set Anisotropy: +    */ +   switch (pipe_sampler->mag_img_filter) { +   case PIPE_TEX_FILTER_NEAREST: +      sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST; +      break; +   case PIPE_TEX_FILTER_LINEAR: +      sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR; +      break; +   case PIPE_TEX_FILTER_ANISO: +      sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR; +      break; +   default: +      break; +   } + +   if (pipe_sampler->max_anisotropy > 2.0) { +      sampler->ss3.max_aniso = MAX2((pipe_sampler->max_anisotropy - 2) / 2, +                                    BRW_ANISORATIO_16); +   } + +   
sampler->ss1.s_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_s); +   sampler->ss1.r_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_r); +   sampler->ss1.t_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_t); + +   /* Fulsim complains if I don't do this.  Hardware doesn't mind: +    */ +#if 0 +   if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) { +      sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE; +      sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE; +      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE; +   } +#endif + +   /* Set shadow function: +    */ +   if (pipe_sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +      /* Shadowing is "enabled" by emitting a particular sampler +       * message (sample_c).  So need to recompile WM program when +       * shadow comparison is enabled on each/any texture unit. +       */ +      sampler->ss0.shadow_function = intel_translate_shadow_compare_func(pipe_sampler->compare_func); +   } + +   /* Set LOD bias: +    */ +   sampler->ss0.lod_bias = S_FIXED(CLAMP(pipe_sampler->lod_bias, -16, 15), 6); + +   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */ +   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */ + +   /* Set BaseMipLevel, MaxLOD, MinLOD: +    * +    * XXX: I don't think that using firstLevel, lastLevel works, +    * because we always setup the surface state as if firstLevel == +    * level zero.  Probably have to subtract firstLevel from each of +    * these: +    */ +   sampler->ss0.base_level = U_FIXED(0, 1); + +   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(pipe_sampler->max_lod, 0), 13), 6); +   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(pipe_sampler->min_lod, 0), 13), 6); + +   sampler->ss2.default_color_pointer = sdc_gs_offset >> 5; +} + + + +/* All samplers must be uploaded in a single contiguous array, which + * complicates various things.  However, this is still too confusing - + * FIXME: simplify all the different new texture state flags. + */ +static void upload_wm_samplers(struct brw_context *brw) +{ +   unsigned unit; +   unsigned sampler_count = 0; + +   /* BRW_NEW_SAMPLER */ +   for (unit = 0; unit < brw->num_textures && unit < brw->num_samplers; +        unit++) { +      /* determine unit enable/disable by looking for a bound texture */ +      if (brw->attribs.Texture[unit]) { +         const struct pipe_sampler_state *sampler = brw->attribs.Samplers[unit]; +	 unsigned sdc_gs_offset = upload_default_color(brw, sampler->border_color); + +	 brw_update_sampler_state(sampler, +				  sdc_gs_offset, +				  &brw->wm.sampler[unit]); + +	 sampler_count = unit + 1; +      } +   } + +   if (brw->wm.sampler_count != sampler_count) { +      brw->wm.sampler_count = sampler_count; +      brw->state.dirty.cache |= CACHE_NEW_SAMPLER; +   } + +   brw->wm.sampler_gs_offset = 0; + +   if (brw->wm.sampler_count) +      brw->wm.sampler_gs_offset = +	 brw_cache_data_sz(&brw->cache[BRW_SAMPLER], +			   brw->wm.sampler, +			   sizeof(struct brw_sampler_state) * brw->wm.sampler_count); +} + +const struct brw_tracked_state brw_wm_samplers = { +   .dirty = { +      .brw = BRW_NEW_SAMPLER, +      .cache = 0 +   }, +   .update = upload_wm_samplers +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c new file mode 100644 index 0000000000..37a9bf919c --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_state.c @@ -0,0 +1,195 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. 
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +/*********************************************************************** + * WM unit - fragment programs and rasterization + */ +static void upload_wm_unit(struct brw_context *brw ) +{ +   struct brw_wm_unit_state wm; +   unsigned max_threads; +   unsigned per_thread; + +   if (BRW_DEBUG & DEBUG_SINGLE_THREAD) +      max_threads = 0; +   else +      max_threads = 31; + + +   memset(&wm, 0, sizeof(wm)); + +   /* CACHE_NEW_WM_PROG */ +   wm.thread0.grf_reg_count = align(brw->wm.prog_data->total_grf, 16) / 16 - 1; +   wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6; +   wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; +   wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length; +   wm.thread3.const_urb_entry_read_length = brw->wm.prog_data->curb_read_length; + +   wm.wm5.max_threads = max_threads; + +   per_thread = align(brw->wm.prog_data->total_scratch, 1024); +   assert(per_thread <= 12 * 1024); + +#if 0 +   if (brw->wm.prog_data->total_scratch) { +      unsigned total = per_thread * (max_threads + 1); + +      /* Scratch space -- just have to make sure there is sufficient space +       * allocated for the active program and current number of threads. +       */ +      brw->wm.scratch_buffer_size = total; +      if (brw->wm.scratch_buffer && +	  brw->wm.scratch_buffer_size > brw->wm.scratch_buffer->size) { +	 dri_bo_unreference(brw->wm.scratch_buffer); +	 brw->wm.scratch_buffer = NULL; +      } +      if (!brw->wm.scratch_buffer) { +	 brw->wm.scratch_buffer = dri_bo_alloc(intel->intelScreen->bufmgr, +					       "wm scratch", +					       brw->wm.scratch_buffer_size, +					       4096, DRM_BO_FLAG_MEM_TT); +      } +   } +   /* XXX: Scratch buffers are not implemented correctly. +    * +    * The scratch offset to be programmed into wm is relative to the general +    * state base address.  However, using dri_bo_alloc/dri_bo_emit_reloc (or +    * the previous bmGenBuffers scheme), we get an offset relative to the +    * start of the framebuffer.  
Even before then, it was broken in other ways, +    * so just fail for now if we hit that path. +    */ +   assert(brw->wm.prog_data->total_scratch == 0); +#endif + +   /* CACHE_NEW_SURFACE */ +   wm.thread1.binding_table_entry_count = brw->wm.nr_surfaces; + +   /* BRW_NEW_CURBE_OFFSETS */ +   wm.thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2; + +   wm.thread3.urb_entry_read_offset = 0; +   wm.thread1.depth_coef_urb_read_offset = 1; +   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + +   /* CACHE_NEW_SAMPLER */ +   wm.wm4.sampler_count = (brw->wm.sampler_count + 1) / 4; +   wm.wm4.sampler_state_pointer = brw->wm.sampler_gs_offset >> 5; + +   /* BRW_NEW_FRAGMENT_PROGRAM */ +   { +      const struct brw_fragment_program *fp = brw->attribs.FragmentProgram; + +      if (fp->UsesDepth) +	 wm.wm5.program_uses_depth = 1; /* as far as we can tell */ + +      if (fp->info.writes_z) +	 wm.wm5.program_computes_depth = 1; + +      /* BRW_NEW_ALPHA_TEST */ +      if (fp->info.uses_kill || +	  brw->attribs.DepthStencil->alpha.enabled) +	 wm.wm5.program_uses_killpixel = 1; + +      wm.wm5.enable_8_pix = 1; +   } + +   wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */ +   wm.wm5.legacy_line_rast = 0; +   wm.wm5.legacy_global_depth_bias = 0; +   wm.wm5.early_depth_test = 1;	        /* never need to disable */ +   wm.wm5.line_aa_region_width = 0; +   wm.wm5.line_endcap_aa_region_width = 1; + +   /* BRW_NEW_RASTERIZER */ +   if (brw->attribs.Raster->poly_stipple_enable) +      wm.wm5.polygon_stipple = 1; + +#if 0 +   if (brw->attribs.Polygon->OffsetFill) { +      wm.wm5.depth_offset = 1; +      /* Something wierd going on with legacy_global_depth_bias, +       * offset_constant, scaling and MRD.  This value passes glean +       * but gives some odd results elsewere (eg. the +       * quad-offset-units test). +       */ +      wm.global_depth_offset_constant = brw->attribs.Polygon->OffsetUnits * 2; + +      /* This is the only value that passes glean: +       */ +      wm.global_depth_offset_scale = brw->attribs.Polygon->OffsetFactor; +   } +#endif + +   if (brw->attribs.Raster->line_stipple_enable) { +      wm.wm5.line_stipple = 1; +   } + +   if (BRW_DEBUG & DEBUG_STATS) +      wm.wm4.stats_enable = 1; + +   brw->wm.state_gs_offset = brw_cache_data( &brw->cache[BRW_WM_UNIT], &wm ); + +   if (brw->wm.prog_data->total_scratch) { +      /* +      dri_emit_reloc(brw->cache[BRW_WM_UNIT].pool->buffer, +		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, +		     (per_thread / 1024) - 1, +		     brw->wm.state_gs_offset + +		     ((char *)&wm.thread2 - (char *)&wm), +		     brw->wm.scratch_buffer); +      */ +   } else { +      wm.thread2.scratch_space_base_pointer = 0; +   } +} + +const struct brw_tracked_state brw_wm_unit = { +   .dirty = { +      .brw = (BRW_NEW_RASTERIZER | +	      BRW_NEW_ALPHA_TEST | +	      BRW_NEW_FS | +	      BRW_NEW_CURBE_OFFSETS), + +      .cache = (CACHE_NEW_SURFACE | +		CACHE_NEW_WM_PROG | +		CACHE_NEW_SAMPLER) +   }, +   .update = upload_wm_unit +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm_surface_state.c b/src/gallium/drivers/i965simple/brw_wm_surface_state.c new file mode 100644 index 0000000000..1bab5bfdb3 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_surface_state.c @@ -0,0 +1,305 @@ +/* + Copyright (C) Intel Corp.  2006.  All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
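For context, brw_wm_unit above is one of the driver's tracked-state atoms: it declares the dirty bits it depends on plus an update() hook that re-emits the hardware state. A minimal sketch of how such atoms are typically walked before a draw (the atom list and the loop are illustrative, not the driver's actual validation code):

static const struct brw_tracked_state *atoms[] = {
   &brw_wm_samplers,
   &brw_wm_unit,
   // ... further atoms, in dependency order
};

static void brw_validate_state(struct brw_context *brw)
{
   for (unsigned i = 0; i < sizeof(atoms) / sizeof(atoms[0]); i++) {
      const struct brw_tracked_state *atom = atoms[i];

      // Run the atom only if one of the flags it declared is currently dirty.
      if ((atom->dirty.brw & brw->state.dirty.brw) ||
          (atom->dirty.cache & brw->state.dirty.cache))
         atom->update(brw);
   }
}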
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +static unsigned translate_tex_target( enum pipe_texture_target target ) +{ +   switch (target) { +   case PIPE_TEXTURE_1D: +      return BRW_SURFACE_1D; + +   case PIPE_TEXTURE_2D: +      return BRW_SURFACE_2D; + +   case PIPE_TEXTURE_3D: +      return BRW_SURFACE_3D; + +   case PIPE_TEXTURE_CUBE: +      return BRW_SURFACE_CUBE; + +   default: +      assert(0); +      return 0; +   } +} + +static unsigned translate_tex_format( enum pipe_format pipe_format ) +{ +   switch( pipe_format ) { +   case PIPE_FORMAT_L8_UNORM: +      return BRW_SURFACEFORMAT_L8_UNORM; + +   case PIPE_FORMAT_I8_UNORM: +      return BRW_SURFACEFORMAT_I8_UNORM; + +   case PIPE_FORMAT_A8_UNORM: +      return BRW_SURFACEFORMAT_A8_UNORM; + +   case PIPE_FORMAT_A8L8_UNORM: +      return BRW_SURFACEFORMAT_L8A8_UNORM; + +   case PIPE_FORMAT_R8G8B8_UNORM: +      assert(0);		/* not supported for sampling */ +      return BRW_SURFACEFORMAT_R8G8B8_UNORM; + +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; + +   case PIPE_FORMAT_R5G6B5_UNORM: +      return BRW_SURFACEFORMAT_B5G6R5_UNORM; + +   case PIPE_FORMAT_A1R5G5B5_UNORM: +      return BRW_SURFACEFORMAT_B5G5R5A1_UNORM; + +   case PIPE_FORMAT_A4R4G4B4_UNORM: +      return BRW_SURFACEFORMAT_B4G4R4A4_UNORM; + +   case PIPE_FORMAT_YCBCR_REV: +      return BRW_SURFACEFORMAT_YCRCB_NORMAL; + +   case PIPE_FORMAT_YCBCR: +      return BRW_SURFACEFORMAT_YCRCB_SWAPUVY; +#if 0 +   case PIPE_FORMAT_RGB_FXT1: +   case PIPE_FORMAT_RGBA_FXT1: +      return BRW_SURFACEFORMAT_FXT1; +#endif + +   case PIPE_FORMAT_Z16_UNORM: +      return BRW_SURFACEFORMAT_I16_UNORM; +#if 0 +   case PIPE_FORMAT_RGB_DXT1: +       return BRW_SURFACEFORMAT_DXT1_RGB; + +   case PIPE_FORMAT_RGBA_DXT1: +       return BRW_SURFACEFORMAT_BC1_UNORM; + +   case PIPE_FORMAT_RGBA_DXT3: +       return BRW_SURFACEFORMAT_BC2_UNORM; + +   case PIPE_FORMAT_RGBA_DXT5: +       return BRW_SURFACEFORMAT_BC3_UNORM; + +   case PIPE_FORMAT_SRGBA8: +      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB; +   case PIPE_FORMAT_SRGB_DXT1: +      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB; +#endif + +   default: +     
 assert(0); +      return 0; +   } +} + +static unsigned brw_buffer_offset(struct brw_context *brw, +                                  struct pipe_buffer *buffer) +{ +   return brw->winsys->get_buffer_offset(brw->winsys, +                                         buffer, +                                         0); +} + +static +void brw_update_texture_surface( struct brw_context *brw, +				 unsigned unit ) +{ +   const struct brw_texture *tObj = brw->attribs.Texture[unit]; +   struct brw_surface_state surf; + +   memset(&surf, 0, sizeof(surf)); + +   surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; +   surf.ss0.surface_type = translate_tex_target(tObj->base.target); +   surf.ss0.surface_format = translate_tex_format(tObj->base.format); + +   /* This is ok for all textures with channel width 8bit or less: +    */ +/*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ + +   /* Updated in emit_reloc */ +   surf.ss1.base_addr = brw_buffer_offset( brw, tObj->buffer ); + +   surf.ss2.mip_count = tObj->base.last_level; +   surf.ss2.width = tObj->base.width[0] - 1; +   surf.ss2.height = tObj->base.height[0] - 1; + +   surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; +   surf.ss3.tiled_surface = 0; /* always zero */ +   surf.ss3.pitch = tObj->stride - 1; +   surf.ss3.depth = tObj->base.depth[0] - 1; + +   surf.ss4.min_lod = 0; + +   if (tObj->base.target == PIPE_TEXTURE_CUBE) { +      surf.ss0.cube_pos_x = 1; +      surf.ss0.cube_pos_y = 1; +      surf.ss0.cube_pos_z = 1; +      surf.ss0.cube_neg_x = 1; +      surf.ss0.cube_neg_y = 1; +      surf.ss0.cube_neg_z = 1; +   } + +   brw->wm.bind.surf_ss_offset[unit + 1] = +      brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf ); +} + + + +#define OFFSET(TYPE, FIELD) ( (unsigned)&(((TYPE *)0)->FIELD) ) + + +static void upload_wm_surfaces(struct brw_context *brw ) +{ +   unsigned i; + +   { +      struct brw_surface_state surf; + +      /* BRW_NEW_FRAMEBUFFER +       */ +      struct pipe_surface *pipe_surface = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/ +      struct brw_texture *tex = (struct brw_texture *)pipe_surface->texture; + +      memset(&surf, 0, sizeof(surf)); + +      if (pipe_surface != NULL) { +	 if (pipe_surface->block.size == 4) +	    surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; +	 else +	    surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; + +	 surf.ss0.surface_type = BRW_SURFACE_2D; + +	 surf.ss1.base_addr = brw_buffer_offset( brw, tex->buffer ); + +	 surf.ss2.width = pipe_surface->width - 1; +	 surf.ss2.height = pipe_surface->height - 1; +	 surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; +	 surf.ss3.tiled_surface = 0; +	 surf.ss3.pitch = pipe_surface->stride - 1; +      } else { +	 surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; +	 surf.ss0.surface_type = BRW_SURFACE_NULL; +      } + +      /* BRW_NEW_BLEND */ +      surf.ss0.color_blend = (!brw->attribs.Blend->logicop_enable && +			      brw->attribs.Blend->blend_enable); + + +      surf.ss0.writedisable_red =   !(brw->attribs.Blend->colormask & PIPE_MASK_R); +      surf.ss0.writedisable_green = !(brw->attribs.Blend->colormask & PIPE_MASK_G); +      surf.ss0.writedisable_blue =  !(brw->attribs.Blend->colormask & PIPE_MASK_B); +      surf.ss0.writedisable_alpha = !(brw->attribs.Blend->colormask & PIPE_MASK_A); + + + + +      brw->wm.bind.surf_ss_offset[0] = brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf ); + +      brw->wm.nr_surfaces = 1; +   } + + +   /* BRW_NEW_TEXTURE +    */ +   for (i = 0; i < brw->num_textures && i < 
brw->num_samplers; i++) { +      const struct brw_texture *texUnit = brw->attribs.Texture[i]; + +      if (texUnit && +	  texUnit->base.refcount/*(texUnit->refcount > 0) == really used */) { + +	 brw_update_texture_surface(brw, i); + +	 brw->wm.nr_surfaces = i+2; +      } +      else { +	 brw->wm.bind.surf_ss_offset[i+1] = 0; +      } +   } + +   brw->wm.bind_ss_offset = brw_cache_data( &brw->cache[BRW_SS_SURF_BIND], +					    &brw->wm.bind ); +} + + +/* KW: Will find a different way to acheive this, see for example the + * state caches with relocs in the i915 swz driver. + */ +#if 0 +static void emit_reloc_wm_surfaces(struct brw_context *brw) +{ +   int unit; + +   if (brw->state.draw_region != NULL) { +      /* Emit framebuffer relocation */ +      dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE), +		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, +		     0, +		     brw->wm.bind.surf_ss_offset[0] + +		     offsetof(struct brw_surface_state, ss1), +		     brw->state.draw_region->buffer); +   } + +   /* Emit relocations for texture buffers */ +   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) { +      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit]; +      struct gl_texture_object *tObj = texUnit->_Current; +      struct intel_texture_object *intelObj = intel_texture_object(tObj); + +      if (texUnit->_ReallyEnabled && intelObj->mt != NULL) { +	 dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE), +			DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, +			0, +			brw->wm.bind.surf_ss_offset[unit + 1] + +			offsetof(struct brw_surface_state, ss1), +			intelObj->mt->region->buffer); +      } +   } +} +#endif + +const struct brw_tracked_state brw_wm_surfaces = { +   .dirty = { +      .brw = (BRW_NEW_FRAMEBUFFER | +	      BRW_NEW_BLEND | +	      BRW_NEW_TEXTURE), +      .cache = 0 +   }, +   .update = upload_wm_surfaces, +}; diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h new file mode 100644 index 0000000000..ff97aaa9af --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h @@ -0,0 +1,196 @@ +#ifndef __NOUVEAU_GLDEFS_H__ +#define __NOUVEAU_GLDEFS_H__ + +static INLINE unsigned +nvgl_blend_func(unsigned factor) +{ +	switch (factor) { +	case PIPE_BLENDFACTOR_ZERO: +		return 0x0000; +	case PIPE_BLENDFACTOR_ONE: +		return 0x0001; +	case PIPE_BLENDFACTOR_SRC_COLOR: +		return 0x0300; +	case PIPE_BLENDFACTOR_INV_SRC_COLOR: +		return 0x0301; +	case PIPE_BLENDFACTOR_SRC_ALPHA: +		return 0x0302; +	case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +		return 0x0303; +	case PIPE_BLENDFACTOR_DST_ALPHA: +		return 0x0304; +	case PIPE_BLENDFACTOR_INV_DST_ALPHA: +		return 0x0305; +	case PIPE_BLENDFACTOR_DST_COLOR: +		return 0x0306; +	case PIPE_BLENDFACTOR_INV_DST_COLOR: +		return 0x0307; +	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +		return 0x0308; +	case PIPE_BLENDFACTOR_CONST_COLOR: +		return 0x8001; +	case PIPE_BLENDFACTOR_INV_CONST_COLOR: +		return 0x8002; +	case PIPE_BLENDFACTOR_CONST_ALPHA: +		return 0x8003; +	case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +		return 0x8004; +	default: +		return 0x0000; +	} +} + +static INLINE unsigned +nvgl_blend_eqn(unsigned func) +{ +	switch (func) { +	case PIPE_BLEND_ADD: +		return 0x8006; +	case PIPE_BLEND_MIN: +		return 0x8007; +	case PIPE_BLEND_MAX: +		return 0x8008; +	case PIPE_BLEND_SUBTRACT: +		return 0x800a; +	case PIPE_BLEND_REVERSE_SUBTRACT: +		return 0x800b; +	default: +		return 0x8006; +	} +} + +static INLINE unsigned +nvgl_logicop_func(unsigned func) +{ +	switch (func) { +	case 
PIPE_LOGICOP_CLEAR: +		return 0x1500; +	case PIPE_LOGICOP_NOR: +		return 0x1508; +	case PIPE_LOGICOP_AND_INVERTED: +		return 0x1504; +	case PIPE_LOGICOP_COPY_INVERTED: +		return 0x150c; +	case PIPE_LOGICOP_AND_REVERSE: +		return 0x1502; +	case PIPE_LOGICOP_INVERT: +		return 0x150a; +	case PIPE_LOGICOP_XOR: +		return 0x1506; +	case PIPE_LOGICOP_NAND: +		return 0x150e; +	case PIPE_LOGICOP_AND: +		return 0x1501; +	case PIPE_LOGICOP_EQUIV: +		return 0x1509; +	case PIPE_LOGICOP_NOOP: +		return 0x1505; +	case PIPE_LOGICOP_OR_INVERTED: +		return 0x150d; +	case PIPE_LOGICOP_COPY: +		return 0x1503; +	case PIPE_LOGICOP_OR_REVERSE: +		return 0x150b; +	case PIPE_LOGICOP_OR: +		return 0x1507; +	case PIPE_LOGICOP_SET: +		return 0x150f; +	default: +		return 0x1505; +	} +} + +static INLINE unsigned +nvgl_comparison_op(unsigned op) +{ +	switch (op) { +	case PIPE_FUNC_NEVER: +		return 0x0200; +	case PIPE_FUNC_LESS: +		return 0x0201; +	case PIPE_FUNC_EQUAL: +		return 0x0202; +	case PIPE_FUNC_LEQUAL: +		return 0x0203; +	case PIPE_FUNC_GREATER: +		return 0x0204; +	case PIPE_FUNC_NOTEQUAL: +		return 0x0205; +	case PIPE_FUNC_GEQUAL: +		return 0x0206; +	case PIPE_FUNC_ALWAYS: +		return 0x0207; +	default: +		return 0x0207; +	} +} + +static INLINE unsigned +nvgl_polygon_mode(unsigned mode) +{ +	switch (mode) { +	case PIPE_POLYGON_MODE_POINT: +		return 0x1b00; +	case PIPE_POLYGON_MODE_LINE: +		return 0x1b01; +	case PIPE_POLYGON_MODE_FILL: +		return 0x1b02; +	default: +		return 0x1b02; +	} +} + +static INLINE unsigned +nvgl_stencil_op(unsigned op) +{ +	switch (op) { +	case PIPE_STENCIL_OP_ZERO: +		return 0x0000; +	case PIPE_STENCIL_OP_INVERT: +		return 0x150a; +	case PIPE_STENCIL_OP_KEEP: +		return 0x1e00; +	case PIPE_STENCIL_OP_REPLACE: +		return 0x1e01; +	case PIPE_STENCIL_OP_INCR: +		return 0x1e02; +	case PIPE_STENCIL_OP_DECR: +		return 0x1e03; +	case PIPE_STENCIL_OP_INCR_WRAP: +		return 0x8507; +	case PIPE_STENCIL_OP_DECR_WRAP: +		return 0x8508; +	default: +		return 0x1e00; +	} +} + +static INLINE unsigned +nvgl_primitive(unsigned prim) { +	switch (prim) { +	case PIPE_PRIM_POINTS: +		return 0x0001; +	case PIPE_PRIM_LINES: +		return 0x0002; +	case PIPE_PRIM_LINE_LOOP: +		return 0x0003; +	case PIPE_PRIM_LINE_STRIP: +		return 0x0004; +	case PIPE_PRIM_TRIANGLES: +		return 0x0005; +	case PIPE_PRIM_TRIANGLE_STRIP: +		return 0x0006; +	case PIPE_PRIM_TRIANGLE_FAN: +		return 0x0007; +	case PIPE_PRIM_QUADS: +		return 0x0008; +	case PIPE_PRIM_QUAD_STRIP: +		return 0x0009; +	case PIPE_PRIM_POLYGON: +		return 0x000a; +	default: +		return 0; +	} +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h new file mode 100644 index 0000000000..54ef1c1291 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_push.h @@ -0,0 +1,82 @@ +#ifndef __NOUVEAU_PUSH_H__ +#define __NOUVEAU_PUSH_H__ + +#include "nouveau/nouveau_winsys.h" + +#ifndef NOUVEAU_PUSH_CONTEXT +#error undefined push context +#endif + +#define OUT_RING(data) do {                                                    \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	(*pc->nvws->channel->pushbuf->cur++) = (data);                         \ +} while(0) + +#define OUT_RINGp(src,size) do {                                               \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	memcpy(pc->nvws->channel->pushbuf->cur, (src), (size) * 4);            \ +	pc->nvws->channel->pushbuf->cur += (size);                             \ +} while(0) + +#define 
OUT_RINGf(data) do {                                                   \ +	union { float v; uint32_t u; } c;                                      \ +	c.v = (data);                                                          \ +	OUT_RING(c.u);                                                         \ +} while(0) + +#define BEGIN_RING(obj,mthd,size) do {                                         \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \ +		pc->nvws->push_flush(pc->nvws, ((size) + 1), NULL);            \ +	OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd));             \ +	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \ +} while(0) + +#define BEGIN_RING_NI(obj,mthd,size) do {                                      \ +	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \ +} while(0) + +#define FIRE_RING(fence) do {                                                  \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	pc->nvws->push_flush(pc->nvws, 0, fence);                              \ +} while(0) + +#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	pc->nvws->push_reloc(pc->nvws, pc->nvws->channel->pushbuf->cur++,      \ +			     (bo), (data), (flags), (vor), (tor));             \ +} while(0) + +/* Raw data + flags depending on FB/TT buffer */ +#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \ +	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \ +} while(0) + +/* FB/TT object handle */ +#define OUT_RELOCo(bo,flags) do {                                              \ +	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \ +		  pc->nvws->channel->vram->handle,                             \ +		  pc->nvws->channel->gart->handle);                            \ +} while(0) + +/* Low 32-bits of offset */ +#define OUT_RELOCl(bo,delta,flags) do {                                        \ +	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \ +} while(0) + +/* High 32-bits of offset */ +#define OUT_RELOCh(bo,delta,flags) do {                                        \ +	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \ +} while(0) + +/* A reloc which'll recombine into a NV_DMA_METHOD packet header */ +#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \ +	NOUVEAU_PUSH_CONTEXT(pc);                                              \ +	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \ +		pc->nvws->push_flush(pc->nvws->channel, ((size) + 1), NULL);   \ +	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \ +		   (flags), 0, 0);                                             \ +	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \ +} while(0) + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h new file mode 100644 index 0000000000..4ae4ff4940 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h @@ -0,0 +1,159 @@ +#ifndef __NOUVEAU_STATEOBJ_H__ +#define __NOUVEAU_STATEOBJ_H__ + +#include "pipe/p_debug.h" + +struct nouveau_stateobj_reloc { +	struct pipe_buffer *bo; + +	unsigned offset; +	unsigned packet; + +	unsigned data; +	unsigned flags; +	unsigned 
vor; +	unsigned tor; +}; + +struct nouveau_stateobj { +	int refcount; + +	unsigned *push; +	struct nouveau_stateobj_reloc *reloc; + +	unsigned *cur; +	unsigned cur_packet; +	unsigned cur_reloc; +}; + +static INLINE struct nouveau_stateobj * +so_new(unsigned push, unsigned reloc) +{ +	struct nouveau_stateobj *so; + +	so = MALLOC(sizeof(struct nouveau_stateobj)); +	so->refcount = 0; +	so->push = MALLOC(sizeof(unsigned) * push); +	so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc); + +	so->cur = so->push; +	so->cur_reloc = so->cur_packet = 0; + +	return so; +} + +static INLINE void +so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso) +{ +	struct nouveau_stateobj *so = *pso; + +	if (ref) { +		ref->refcount++; +	} + +	if (so && --so->refcount <= 0) { +		free(so->push); +		free(so->reloc); +		free(so); +	} + +	*pso = ref; +} + +static INLINE void +so_data(struct nouveau_stateobj *so, unsigned data) +{ +	(*so->cur++) = (data); +	so->cur_packet += 4; +} + +static INLINE void +so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size) +{ +	so->cur_packet += (4 * size); +	while (size--) +		(*so->cur++) = (*data++); +} + +static INLINE void +so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr, +	  unsigned mthd, unsigned size) +{ +	so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4); +	so_data(so, (gr->subc << 13) | (size << 18) | mthd); +} + +static INLINE void +so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo, +	 unsigned data, unsigned flags, unsigned vor, unsigned tor) +{ +	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++]; +	 +	r->bo = bo; +	r->offset = so->cur - so->push; +	r->packet = so->cur_packet; +	r->data = data; +	r->flags = flags; +	r->vor = vor; +	r->tor = tor; +	so_data(so, data); +} + +static INLINE void +so_dump(struct nouveau_stateobj *so) +{ +	unsigned i, nr = so->cur - so->push; + +	for (i = 0; i < nr; i++) +		debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]); +} + +static INLINE void +so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ +	struct nouveau_pushbuf *pb = nvws->channel->pushbuf; +	unsigned nr, i; + +	nr = so->cur - so->push; +	if (pb->remaining < nr) +		nvws->push_flush(nvws, nr, NULL); +	pb->remaining -= nr; + +	memcpy(pb->cur, so->push, nr * 4); +	for (i = 0; i < so->cur_reloc; i++) { +		struct nouveau_stateobj_reloc *r = &so->reloc[i]; + +		nvws->push_reloc(nvws, pb->cur + r->offset, r->bo, +				 r->data, r->flags, r->vor, r->tor); +	} +	pb->cur += nr; +} + +static INLINE void +so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ +	struct nouveau_pushbuf *pb = nvws->channel->pushbuf; +	unsigned i; + +	if (!so) +		return; + +	i = so->cur_reloc << 1; +	if (nvws->channel->pushbuf->remaining < i) +		nvws->push_flush(nvws, i, NULL); +	nvws->channel->pushbuf->remaining -= i; + +	for (i = 0; i < so->cur_reloc; i++) { +		struct nouveau_stateobj_reloc *r = &so->reloc[i]; + +		nvws->push_reloc(nvws, pb->cur++, r->bo, r->packet, +				 (r->flags & (NOUVEAU_BO_VRAM | +					      NOUVEAU_BO_GART | +					      NOUVEAU_BO_RDWR)) | +				 NOUVEAU_BO_DUMMY, 0, 0); +		nvws->push_reloc(nvws, pb->cur++, r->bo, r->data, +				 r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor); +	} +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h new file mode 100644 index 0000000000..a10114beab --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_util.h @@ -0,0 +1,91 @@ +#ifndef __NOUVEAU_UTIL_H__ +#define 
__NOUVEAU_UTIL_H__ + +/* Determine how many vertices can be pushed into the command stream. + * Where the remaining space isn't large enough to represent all verices, + * split the buffer at primitive boundaries. + * + * Returns a count of vertices that can be rendered, and an index to + * restart drawing at after a flush. + */ +static INLINE unsigned +nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp, +		   unsigned mode, unsigned start, unsigned count, +		   unsigned *restart) +{ +	int max, adj = 0; + +	max  = remaining - overhead; +	if (max < 0) +		return 0; + +	max *= vpp; +	if (max >= count) +		return count; + +	switch (mode) { +	case PIPE_PRIM_POINTS: +		break; +	case PIPE_PRIM_LINES: +		max = max & 1; +		break; +	case PIPE_PRIM_TRIANGLES: +		max = max - (max % 3); +		break; +	case PIPE_PRIM_QUADS: +		max = max & 3; +		break; +	case PIPE_PRIM_LINE_LOOP: +	case PIPE_PRIM_LINE_STRIP: +		if (max < 2) +			max = 0; +		adj = 1; +		break; +	case PIPE_PRIM_POLYGON: +	case PIPE_PRIM_TRIANGLE_STRIP: +	case PIPE_PRIM_TRIANGLE_FAN: +		if (max < 3) +			max = 0; +		adj = 2; +		break; +	case PIPE_PRIM_QUAD_STRIP: +		if (max < 4) +			max = 0; +		adj = 3; +		break; +	default: +		assert(0); +	} + +	*restart = start + max - adj; +	return max; +} + +/* Integer base-2 logarithm, rounded towards zero. */ +static INLINE unsigned log2i(unsigned i) +{ +	unsigned r = 0; + +	if (i & 0xffff0000) { +		i >>= 16; +		r += 16; +	} +	if (i & 0x0000ff00) { +		i >>= 8; +		r += 8; +	} +	if (i & 0x000000f0) { +		i >>= 4; +		r += 4; +	} +	if (i & 0x0000000c) { +		i >>= 2; +		r += 2; +	} +	if (i & 0x00000002) { +		r += 1; +	} +	return r; +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h new file mode 100644 index 0000000000..b86c4b9338 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -0,0 +1,100 @@ +#ifndef NOUVEAU_WINSYS_H +#define NOUVEAU_WINSYS_H + +#include <stdint.h> +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_defines.h" + +#include "nouveau/nouveau_bo.h" +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_class.h" +#include "nouveau/nouveau_device.h" +#include "nouveau/nouveau_grobj.h" +#include "nouveau/nouveau_notifier.h" +#include "nouveau/nouveau_resource.h" +#include "nouveau/nouveau_pushbuf.h" + +#define NOUVEAU_CAP_HW_VTXBUF (0xbeef0000) +#define NOUVEAU_CAP_HW_IDXBUF (0xbeef0001) + +#define NOUVEAU_TEXTURE_USAGE_LINEAR (1 << 16) + +#define NOUVEAU_BUFFER_USAGE_TEXTURE (1 << 16) +#define NOUVEAU_BUFFER_USAGE_ZETA    (1 << 17) + +struct nouveau_winsys { +	struct nouveau_context *nv; + +	struct nouveau_channel *channel; + +	int  (*res_init)(struct nouveau_resource **heap, unsigned start, +			 unsigned size); +	int  (*res_alloc)(struct nouveau_resource *heap, int size, void *priv, +			  struct nouveau_resource **); +	void (*res_free)(struct nouveau_resource **); + +	int  (*push_reloc)(struct nouveau_winsys *, void *ptr, +			   struct pipe_buffer *, uint32_t data, +			   uint32_t flags, uint32_t vor, uint32_t tor); +	int  (*push_flush)(struct nouveau_winsys *, unsigned size, +			   struct pipe_fence_handle **fence); +			        +	int       (*grobj_alloc)(struct nouveau_winsys *, int grclass, +				 struct nouveau_grobj **); +	void      (*grobj_free)(struct nouveau_grobj **); + +	int       (*notifier_alloc)(struct nouveau_winsys *, int count, +				    struct nouveau_notifier **); +	void      (*notifier_free)(struct nouveau_notifier **); +	void      (*notifier_reset)(struct 
nouveau_notifier *, int id); +	uint32_t  (*notifier_status)(struct nouveau_notifier *, int id); +	uint32_t  (*notifier_retval)(struct nouveau_notifier *, int id); +	int       (*notifier_wait)(struct nouveau_notifier *, int id, +				   int status, double timeout); + +	int (*surface_copy)(struct nouveau_winsys *, struct pipe_surface *, +			    unsigned, unsigned, struct pipe_surface *, +			    unsigned, unsigned, unsigned, unsigned); +	int (*surface_fill)(struct nouveau_winsys *, struct pipe_surface *, +			    unsigned, unsigned, unsigned, unsigned, unsigned); + +	struct nouveau_bo *(*get_bo)(struct pipe_buffer *); +}; + +extern struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv04_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv10_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv20_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv30_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv40_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv50_create(struct pipe_screen *, unsigned pctx_id); + +#endif diff --git a/src/gallium/drivers/nv04/Makefile b/src/gallium/drivers/nv04/Makefile new file mode 100644 index 0000000000..4ed62dae95 --- /dev/null +++ b/src/gallium/drivers/nv04/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. 
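A usage sketch for the nouveau_stateobj helpers declared in nouveau_stateobj.h above: state is staged with so_method()/so_data() and later copied into the channel's pushbuf by so_emit(). The graphics object and method offset below are placeholders, not real hardware methods:

static void emit_example_state(struct nouveau_winsys *nvws,
                               struct nouveau_grobj *grobj)
{
	/* Room for one method header plus three data words, no relocations. */
	struct nouveau_stateobj *so = so_new(4, 0);

	so_method(so, grobj, 0x0200 /* placeholder method */, 3);
	so_data(so, 0x0);
	so_data(so, 0x1);
	so_data(so, 0x2);

	so_emit(nvws, so);   /* copy the words into the pushbuf, applying relocs */
	so_ref(NULL, &so);   /* drop our reference; frees the object */
}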
+include $(TOP)/configs/current + +LIBNAME = nv04 + +DRIVER_SOURCES = \ +	nv04_surface_2d.c \ +	nv04_clear.c \ +	nv04_context.c \ +	nv04_fragprog.c \ +	nv04_fragtex.c \ +	nv04_miptree.c \ +	nv04_prim_vbuf.c \ +	nv04_screen.c \ +	nv04_state.c \ +	nv04_state_emit.c \ +	nv04_surface.c \ +	nv04_vbo.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv04/nv04_clear.c b/src/gallium/drivers/nv04/nv04_clear.c new file mode 100644 index 0000000000..01cacd36fe --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" + +void +nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c new file mode 100644 index 0000000000..d6710cd892 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.c @@ -0,0 +1,107 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	draw_flush(nv04->draw); + +	FIRE_RING(fence); +} + +static void +nv04_destroy(struct pipe_context *pipe) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	if (nv04->draw) +		draw_destroy(nv04->draw); + +	FREE(nv04); +} + +static void +nv04_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +static boolean +nv04_init_hwctx(struct nv04_context *nv04) +{ +	// requires a valid handle +//	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1); +//	OUT_RING(0); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1); +	OUT_RING(0); + +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); +	OUT_RING(0x40182800); +//	OUT_RING(1<<20/*no cull*/); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); +//	OUT_RING(0x24|(1<<6)|(1<<8)); +	OUT_RING(0x120001a4); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1); +	OUT_RING(0x332213a1); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1); +	OUT_RING(0x11001010); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1); +	OUT_RING(0x0); +//	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1); +//	OUT_RING(SCREEN_OFFSET); +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1); +	OUT_RING(0xff000000); + + + +	FIRE_RING (NULL); +	return TRUE; +} + +struct pipe_context * +nv04_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct nv04_screen *screen = nv04_screen(pscreen); +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv04_context *nv04; +	struct nouveau_winsys *nvws = screen->nvws; + +	nv04 = CALLOC(1, sizeof(struct nv04_context)); +	if (!nv04) +		return NULL; +	nv04->screen = screen; +	nv04->pctx_id = pctx_id; + +	nv04->nvws = nvws; + +	nv04->pipe.winsys = ws; +	nv04->pipe.screen = pscreen; +	nv04->pipe.destroy = nv04_destroy; +	nv04->pipe.set_edgeflags = nv04_set_edgeflags; +	nv04->pipe.draw_arrays = nv04_draw_arrays; +	nv04->pipe.draw_elements = nv04_draw_elements; +	nv04->pipe.clear = nv04_clear; +	nv04->pipe.flush = nv04_flush; + +	nv04_init_surface_functions(nv04); +	
nv04_init_state_functions(nv04); + +	nv04->draw = draw_create(); +	assert(nv04->draw); +	draw_wide_point_threshold(nv04->draw, 0.0); +	draw_wide_line_threshold(nv04->draw, 0.0); +	draw_enable_line_stipple(nv04->draw, FALSE); +	draw_enable_point_sprites(nv04->draw, FALSE); +	draw_set_rasterize_stage(nv04->draw, nv04_draw_vbuf_stage(nv04)); + +	nv04_init_hwctx(nv04); + +	return &nv04->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h new file mode 100644 index 0000000000..2842b2c90d --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.h @@ -0,0 +1,151 @@ +#ifndef __NV04_CONTEXT_H__ +#define __NV04_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \ +	struct nv04_screen *ctx = nv04->screen +#include "nouveau/nouveau_push.h" + +#include "nv04_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ +	fprintf(stderr, "nouveau: "fmt, ##args); + +#include "nv04_screen.h" + +#define NV04_NEW_VERTPROG	(1 << 1) +#define NV04_NEW_FRAGPROG	(1 << 2) +#define NV04_NEW_BLEND		(1 << 3) +#define NV04_NEW_RAST		(1 << 4) +#define NV04_NEW_CONTROL	(1 << 5) +#define NV04_NEW_VIEWPORT	(1 << 6) +#define NV04_NEW_SAMPLER	(1 << 7) +#define NV04_NEW_FRAMEBUFFER	(1 << 8) +#define NV04_NEW_VTXARRAYS	(1 << 9) + +struct nv04_context { +	struct pipe_context pipe; + +	struct nouveau_winsys *nvws; +	struct nv04_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	int chipset; +	struct nouveau_notifier *sync; + +	uint32_t dirty; + +	struct nv04_blend_state *blend; +	struct nv04_sampler_state *sampler[PIPE_MAX_SAMPLERS]; +	struct nv04_fragtex_state fragtex; +	struct nv04_rasterizer_state *rast; +	struct nv04_depth_stencil_alpha_state *dsa; + +	struct nv04_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; +	unsigned dirty_samplers; +	unsigned fp_samplers; +	unsigned vp_samplers; + +	uint32_t rt_enable; +	struct pipe_framebuffer_state *framebuffer; +	struct pipe_surface *rt; +	struct pipe_surface *zeta; + +	struct { +		struct pipe_buffer *buffer; +		uint32_t format; +	} tex[16]; + +	unsigned vb_enable; +	struct { +		struct pipe_buffer *buffer; +		unsigned delta; +	} vb[16]; + +	float *constbuf[PIPE_SHADER_TYPES][32][4]; +	unsigned constbuf_nr[PIPE_SHADER_TYPES]; + +	struct vertex_info vertex_info; +	struct { +	 +		struct nouveau_resource *exec_heap; +		struct nouveau_resource *data_heap; + +		struct nv04_vertex_program *active; + +		struct nv04_vertex_program *current; +		struct pipe_buffer *constant_buf; +	} vertprog; + +	struct { +		struct nv04_fragment_program *active; + +		struct nv04_fragment_program *current; +		struct pipe_buffer *constant_buf; +	} fragprog; + +	struct pipe_vertex_buffer  vtxbuf[PIPE_MAX_ATTRIBS]; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + +	struct pipe_viewport_state viewport; +}; + +static INLINE struct nv04_context * +nv04_context(struct pipe_context *pipe) +{ +	return (struct nv04_context *)pipe; +} + +extern void nv04_init_state_functions(struct nv04_context *nv04); +extern void nv04_init_surface_functions(struct nv04_context *nv04); +extern void nv04_screen_init_miptree_functions(struct pipe_screen 
*screen); + +/* nv04_clear.c */ +extern void nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +/* nv04_draw.c */ +extern struct draw_stage *nv04_draw_render_stage(struct nv04_context *nv04); + +/* nv04_fragprog.c */ +extern void nv04_fragprog_bind(struct nv04_context *, +			       struct nv04_fragment_program *); +extern void nv04_fragprog_destroy(struct nv04_context *, +				  struct nv04_fragment_program *); + +/* nv04_fragtex.c */ +extern void nv04_fragtex_bind(struct nv04_context *); + +/* nv04_prim_vbuf.c */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ); + +/* nv04_state.c and friends */ +extern void nv04_emit_hw_state(struct nv04_context *nv04); +extern void nv04_state_tex_update(struct nv04_context *nv04); + +/* nv04_vbo.c */ +extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv04_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_fragprog.c b/src/gallium/drivers/nv04/nv04_fragprog.c new file mode 100644 index 0000000000..8a2af41fe0 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv04_context.h" + +void +nv04_fragprog_bind(struct nv04_context *nv04, struct nv04_fragment_program *fp) +{ +} + +void +nv04_fragprog_destroy(struct nv04_context *nv04, +		      struct nv04_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c new file mode 100644 index 0000000000..21f990fd53 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragtex.c @@ -0,0 +1,73 @@ +#include "nv04_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf)                                                                \ +{                                                                              \ +  PIPE_FORMAT_##m,                                                             \ +  NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf,                                               \ +} + +struct nv04_texture_format { +	uint	pipe; +	int     format; +}; + +static struct nv04_texture_format +nv04_texture_formats[] = { +	_(A8R8G8B8_UNORM, A8R8G8B8), +	_(X8R8G8B8_UNORM, X8R8G8B8), +	_(A1R5G5B5_UNORM, A1R5G5B5), +	_(A4R4G4B4_UNORM, A4R4G4B4), +	_(L8_UNORM,       Y8      ), +	_(A8_UNORM,       Y8      ), +}; + +static uint32_t +nv04_fragtex_format(uint pipe_format) +{ +	struct nv04_texture_format *tf = nv04_texture_formats; +	int i; + +	for (i=0; i< sizeof(nv04_texture_formats)/sizeof(nv04_texture_formats[0]); i++) { +		if (tf->pipe == pipe_format) +			return tf->format; +		tf++; +	} + +	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); +	return 0; +} + + +static void +nv04_fragtex_build(struct nv04_context *nv04, int unit) +{ +	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; +	struct pipe_texture *pt = &nv04mt->base; + +	switch (pt->target) { +	case PIPE_TEXTURE_2D: +		break; +	default: +		NOUVEAU_ERR("Unknown target %d\n", pt->target); +		return; +	} + +	nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER  +		| 
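/* The FORMAT word's BASE_SIZE_U/V fields hold log2 of the texture dimensions
 * (hence the log2i() terms just below), which is one reason the screen
 * reports no NPOT texture support. */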
NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER +		| nv04_fragtex_format(pt->format) +		| ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT ) +		| ( log2i(pt->width[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT ) +		| ( log2i(pt->height[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT ) +		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE +		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE +		; +} + + +void +nv04_fragtex_bind(struct nv04_context *nv04) +{ +	nv04_fragtex_build(nv04, 0); +} + diff --git a/src/gallium/drivers/nv04/nv04_miptree.c b/src/gallium/drivers/nv04/nv04_miptree.c new file mode 100644 index 0000000000..993c5ef5dd --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_miptree.c @@ -0,0 +1,177 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_miptree_layout(struct nv04_miptree *nv04mt) +{ +	struct pipe_texture *pt = &nv04mt->base; +	uint width = pt->width[0], height = pt->height[0]; +	uint offset = 0; +	int nr_faces, l; + +	nr_faces = 1; + +	for (l = 0; l <= pt->last_level; l++) { +		pt->width[l] = width; +		pt->height[l] = height; + +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); +		 +		nv04mt->level[l].pitch = pt->width[0]; +		nv04mt->level[l].pitch = (nv04mt->level[l].pitch + 63) & ~63; + +		width  = MAX2(1, width  >> 1); +		height = MAX2(1, height >> 1); +	} + +	for (l = 0; l <= pt->last_level; l++) { + +		nv04mt->level[l].image_offset = offset; +		offset += nv04mt->level[l].pitch * pt->height[l]; +	} + +	nv04mt->total_size = offset; +} + +static struct pipe_texture * +nv04_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv04_miptree *mt; + +	mt = MALLOC(sizeof(struct nv04_miptree)); +	if (!mt) +		return NULL; +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->shadow_tex = NULL; +	mt->shadow_surface = NULL; + +	//mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + +	nv04_miptree_layout(mt); + +	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL | +						NOUVEAU_BUFFER_USAGE_TEXTURE, +						mt->total_size); +	if (!mt->buffer) { +		printf("failed %d byte alloc\n",mt->total_size); +		FREE(mt); +		return NULL; +	} +	 +	return &mt->base; +} + +static struct pipe_texture * +nv04_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv04_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv04_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->level[0].pitch = stride[0]; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static void +nv04_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ +	struct pipe_texture *pt = *ppt; +	struct nv04_miptree *mt = (struct nv04_miptree *)pt; +	int l; + +	*ppt = NULL; +	if (--pt->refcount) +		return; + +	pipe_buffer_reference(pscreen, &mt->buffer, NULL); +	for (l = 0; l <= pt->last_level; l++) { +		if (mt->level[l].image_offset) +			
FREE(mt->level[l].image_offset); +	} + +	if (mt->shadow_tex) { +		assert(mt->shadow_surface); +		pscreen->tex_surface_release(pscreen, &mt->shadow_surface); +		nv04_miptree_release(pscreen, &mt->shadow_tex); +	} + +	FREE(mt); +} + +static struct pipe_surface * +nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct nv04_miptree *nv04mt = (struct nv04_miptree *)pt; +	struct pipe_surface *ps; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = nv04mt->level[level].pitch; +	ps->usage = flags; +	ps->status = PIPE_SURFACE_STATUS_DEFINED; +	ps->refcount = 1; +	ps->face = face; +	ps->level = level; +	ps->zslice = zslice; + +	ps->offset = nv04mt->level[level].image_offset; + +	return ps; +} + +static void +nv04_miptree_surface_del(struct pipe_screen *pscreen, +			 struct pipe_surface **psurface) +{ +	struct pipe_surface *ps = *psurface; + +	*psurface = NULL; +	if (--ps->refcount > 0) +		return; + +	pipe_texture_reference(&ps->texture, NULL); +	FREE(ps); +} + +void +nv04_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv04_miptree_create; +	pscreen->texture_blanket = nv04_miptree_blanket; +	pscreen->texture_release = nv04_miptree_release; +	pscreen->get_tex_surface = nv04_miptree_surface_new; +	pscreen->tex_surface_release = nv04_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c new file mode 100644 index 0000000000..18a8872ae3 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c @@ -0,0 +1,309 @@ + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_compiler.h" + +#include "draw/draw_vbuf.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#define VERTEX_SIZE 40 +#define VERTEX_BUFFER_SIZE (4096*VERTEX_SIZE) // 4096 vertices of 40 bytes each + +/** + * Primitive renderer for nv04. + */ +struct nv04_vbuf_render { +	struct vbuf_render base; + +	struct nv04_context *nv04;    + +	/** Vertex buffer */ +	unsigned char* buffer; + +	/** Vertex size in bytes */ +	unsigned vertex_size; + +	/** Current primitive */ +	unsigned prim; +}; + + +/** + * Basically a cast wrapper. 
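 * (The downcast is safe because struct nv04_vbuf_render stores its
 * vbuf_render base as its first member, so the two pointers coincide.)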
+ */ +static INLINE struct nv04_vbuf_render * +nv04_vbuf_render( struct vbuf_render *render ) +{ +	assert(render); +	return (struct nv04_vbuf_render *)render; +} + + +static const struct vertex_info * +nv04_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); +	struct nv04_context *nv04 = nv04_render->nv04; +	return &nv04->vertex_info; +} + + +static void * +nv04_vbuf_render_allocate_vertices( struct vbuf_render *render, +		ushort vertex_size, +		ushort nr_vertices ) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + +	nv04_render->buffer = (unsigned char*) MALLOC(VERTEX_BUFFER_SIZE); +	assert(!nv04_render->buffer); + +	return nv04_render->buffer; +} + + +static boolean  +nv04_vbuf_render_set_primitive( struct vbuf_render *render,  +		unsigned prim ) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + +	if (prim <= PIPE_PRIM_LINE_STRIP) +		return FALSE; + +	nv04_render->prim = prim; +	return TRUE; +} + +static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5) +{ +	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49); +	OUT_RINGp(buffer + VERTEX_SIZE * v0,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v1,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v2,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v3,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v4,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v5,8); +	OUT_RING(0xFEDCBA); +} + +static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2) +{ +	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25); +	OUT_RINGp(buffer + VERTEX_SIZE * v0,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v1,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v2,8); +	OUT_RING(0xFED); +} + +static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3) +{ +	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33); +	OUT_RINGp(buffer + VERTEX_SIZE * v0,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v1,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v2,8); +	OUT_RINGp(buffer + VERTEX_SIZE * v3,8); +	OUT_RING(0xFECEDC); +} + +static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices) +{ +	unsigned char* buffer = render->buffer; +	struct nv04_context* nv04 = render->nv04; +	int i; + +	for( i=0; i< nr_indices-5; i+=6) +		nv04_2triangles(nv04, +				buffer, +				indices[i+0], +				indices[i+1], +				indices[i+2], +				indices[i+3], +				indices[i+4], +				indices[i+5] +			       ); +	if (i != nr_indices) +	{ +		nv04_1triangle(nv04, +				buffer, +				indices[i+0], +				indices[i+1], +				indices[i+2] +			       ); +		i+=3; +	} +	if (i != nr_indices) +		NOUVEAU_ERR("Houston, we have lost some vertices\n"); +} + +static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ +	const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC}; +	unsigned char* buffer = render->buffer; +	struct nv04_context* nv04 = render->nv04; +	int i,j; + +	for(i = 0; i<nr_indices; i+=14)  +	{ +		int numvert = MIN2(16, nr_indices - i); +		int numtri = numvert - 2; +		if (numvert<3) +			break; + +		BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 ); +		for(j = 0; j<numvert; j++) +			OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 
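/* Each TLVERTEX_DRAWPRIMITIVE word that follows packs up to two triangles as
 * six 4-bit indices into the 16-entry TLVERTEX window being loaded here;
 * striptbl[] pre-encodes that strip order, and the 0xFFF mask keeps only the
 * low three nibbles (one triangle) when the triangle count is odd. */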
8 ); + +		BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 ); +		for(j = 0; j<numtri/2; j++ ) +			OUT_RING(striptbl[j]); +		if (numtri%2) +			OUT_RING(striptbl[numtri/2]&0xFFF); +	} +} + +static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ +	const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0}; +	unsigned char* buffer = render->buffer; +	struct nv04_context* nv04 = render->nv04; +	int i,j; + +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8); +	OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8); + +	for(i = 1; i<nr_indices; i+=14) +	{ +		int numvert=MIN2(15, nr_indices - i); +		int numtri=numvert-2; +		if (numvert < 3) +			break; + +		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8); + +		for(j=0;j<numvert;j++) +			OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 ); + +		BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2); +		for(j = 0; j<numtri/2; j++) +			OUT_RING(fantbl[j]); +		if (numtri%2) +			OUT_RING(fantbl[numtri/2]&0xFFF); +	} +} + +static void nv04_vbuf_render_quads_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ +	unsigned char* buffer = render->buffer; +	struct nv04_context* nv04 = render->nv04; +	int i; + +	for(i = 0; i < nr_indices; i += 4) +		nv04_1quad(nv04, +				buffer, +				indices[i+0], +				indices[i+1], +				indices[i+2], +				indices[i+3] +			       ); +} + + +static void  +nv04_vbuf_render_draw( struct vbuf_render *render, +		const ushort *indices, +		uint nr_indices) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + +	// emit the indices +	switch( nv04_render->prim ) +	{ +		case PIPE_PRIM_TRIANGLES: +			nv04_vbuf_render_triangles_elts(nv04_render, indices, nr_indices); +			break; +		case PIPE_PRIM_QUAD_STRIP: +		case PIPE_PRIM_TRIANGLE_STRIP: +			nv04_vbuf_render_tri_strip_elts(nv04_render, indices, nr_indices); +			break; +		case PIPE_PRIM_TRIANGLE_FAN: +		case PIPE_PRIM_POLYGON: +			nv04_vbuf_render_tri_fan_elts(nv04_render, indices, nr_indices); +			break; +		case PIPE_PRIM_QUADS: +			nv04_vbuf_render_quads_elts(nv04_render, indices, nr_indices); +			break; +		default: +			NOUVEAU_ERR("You have to implement primitive %d, young padawan\n", nv04_render->prim); +			break; +	} +} + + +static void +nv04_vbuf_render_release_vertices( struct vbuf_render *render, +		void *vertices,  +		unsigned vertex_size, +		unsigned vertices_used ) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + +	free(nv04_render->buffer); +	nv04_render->buffer = NULL; +} + + +static void +nv04_vbuf_render_destroy( struct vbuf_render *render ) +{ +	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); +	FREE(nv04_render); +} + + +/** + * Create a new primitive render. 
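 * The callbacks wired up below are handed to draw_vbuf_stage(), which buffers
 * post-transform vertices and hands them back as ushort indices for the
 * TLVERTEX submission helpers above.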
+ */ +static struct vbuf_render * +nv04_vbuf_render_create( struct nv04_context *nv04 ) +{ +	struct nv04_vbuf_render *nv04_render = CALLOC_STRUCT(nv04_vbuf_render); + +	nv04_render->nv04 = nv04; + +	nv04_render->base.max_vertex_buffer_bytes = VERTEX_BUFFER_SIZE; +	nv04_render->base.max_indices = 65536;  +	nv04_render->base.get_vertex_info = nv04_vbuf_render_get_vertex_info; +	nv04_render->base.allocate_vertices = nv04_vbuf_render_allocate_vertices; +	nv04_render->base.set_primitive = nv04_vbuf_render_set_primitive; +	nv04_render->base.draw = nv04_vbuf_render_draw; +	nv04_render->base.release_vertices = nv04_vbuf_render_release_vertices; +	nv04_render->base.destroy = nv04_vbuf_render_destroy; + +	return &nv04_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ) +{ +	struct vbuf_render *render; +	struct draw_stage *stage; + +	render = nv04_vbuf_render_create(nv04); +	if(!render) +		return NULL; + +	stage = draw_vbuf_stage( nv04->draw, render ); +	if(!stage) { +		render->destroy(render); +		return NULL; +	} + +	return stage; +} diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c new file mode 100644 index 0000000000..9ef38bc244 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.c @@ -0,0 +1,237 @@ +#include "pipe/p_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_simple_screen.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static const char * +nv04_screen_get_name(struct pipe_screen *screen) +{ +	struct nv04_screen *nv04screen = nv04_screen(screen); +	struct nouveau_device *dev = nv04screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv04_screen_get_vendor(struct pipe_screen *screen) +{ +	return "nouveau"; +} + +static int +nv04_screen_get_param(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 1; +	case PIPE_CAP_NPOT_TEXTURES: +		return 0; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 0; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 0; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 0; +	case PIPE_CAP_POINT_SPRITE: +		return 0; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 1; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 0; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 10; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 0; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; +	case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +		return 0; +	case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +		return 1; +	case NOUVEAU_CAP_HW_VTXBUF: +	case NOUVEAU_CAP_HW_IDXBUF: +		return 0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv04_screen_get_paramf(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 0.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 0.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 0.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 0.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static boolean +nv04_screen_is_format_supported(struct pipe_screen *screen, +				enum pipe_format format, +				enum pipe_texture_target target, +				unsigned 
tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_X8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static void * +nv04_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, +		 unsigned flags ) +{ +	void *map; +	struct nv04_miptree *nv04mt = (struct nv04_miptree *)surface->texture; + +	map = pipe_buffer_map(screen, nv04mt->buffer, flags); +	if (!map) +		return NULL; + +	return map + surface->offset; +} + +static void +nv04_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ +	struct nv04_miptree *nv04mt = (struct nv04_miptree *)surface->texture; + +	pipe_buffer_unmap(screen, nv04mt->buffer); +} + +static void +nv04_screen_destroy(struct pipe_screen *pscreen) +{ +	struct nv04_screen *screen = nv04_screen(pscreen); +	struct nouveau_winsys *nvws = screen->nvws; + +	nvws->notifier_free(&screen->sync); +	nvws->grobj_free(&screen->fahrenheit); +	nv04_surface_2d_takedown(&screen->eng2d); + +	FREE(pscreen); +} + +static struct pipe_buffer * +nv04_surface_buffer(struct pipe_surface *surf) +{ +	struct nv04_miptree *mt = (struct nv04_miptree *)surf->texture; + +	return mt->buffer; +} + +struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv04_screen *screen = CALLOC_STRUCT(nv04_screen); +	unsigned fahrenheit_class = 0, sub3d_class = 0; +	unsigned chipset = nvws->channel->device->chipset; +	int ret; + +	if (!screen) +		return NULL; +	screen->nvws = nvws; + +	if (chipset>=0x20) { +		fahrenheit_class = 0; +		sub3d_class = 0; +	} else if (chipset>=0x10) { +		fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE; +		sub3d_class = NV10_CONTEXT_SURFACES_3D; +	} else { +		fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE; +		sub3d_class = NV04_CONTEXT_SURFACES_3D; +	} + +	if (!fahrenheit_class) { +		NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", chipset); +		return NULL; +	} + +	/* 2D engine setup */ +	screen->eng2d = nv04_surface_2d_init(nvws); +	screen->eng2d->buf = nv04_surface_buffer; + +	/* 3D object */ +	ret = nvws->grobj_alloc(nvws, fahrenheit_class, &screen->fahrenheit); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		return NULL; +	} + +	/* 3D surface object */ +	ret = nvws->grobj_alloc(nvws, sub3d_class, &screen->context_surfaces_3d); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D surface object: %d\n", ret); +		return NULL; +	} + +	/* Notifier for sync purposes */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv04_screen_destroy(&screen->pipe); +		return NULL; +	} + +	screen->pipe.winsys = ws; +	screen->pipe.destroy = nv04_screen_destroy; + +	screen->pipe.get_name = nv04_screen_get_name; +	screen->pipe.get_vendor = nv04_screen_get_vendor; +	screen->pipe.get_param = nv04_screen_get_param; +	screen->pipe.get_paramf = nv04_screen_get_paramf; + +	screen->pipe.is_format_supported = nv04_screen_is_format_supported; + +	screen->pipe.surface_map = nv04_surface_map; +	screen->pipe.surface_unmap = nv04_surface_unmap; + +	
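/*
 * nv04_surface_map() above returns the buffer mapping already advanced by
 * surface->offset, so callers index the returned pointer directly using the
 * surface stride.  A minimal usage sketch through the hooks installed above;
 * the helper is illustrative only and assumes a CPU-mappable 32bpp surface
 * and that PIPE_BUFFER_USAGE_CPU_WRITE is the appropriate mapping flag.
 */
static void
poke_pixel(struct pipe_screen *screen, struct pipe_surface *surf,
	   unsigned x, unsigned y, uint32_t value)
{
	uint32_t *map = screen->surface_map(screen, surf,
					    PIPE_BUFFER_USAGE_CPU_WRITE);
	if (!map)
		return;
	map[y * (surf->stride / 4) + x] = value;  /* offset already applied */
	screen->surface_unmap(screen, surf);
}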
nv04_screen_init_miptree_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_screen.h b/src/gallium/drivers/nv04/nv04_screen.h new file mode 100644 index 0000000000..540aec907b --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.h @@ -0,0 +1,27 @@ +#ifndef __NV04_SCREEN_H__ +#define __NV04_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04_surface_2d.h" + +struct nv04_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; +	unsigned chipset; + +	/* HW graphics objects */ +	struct nv04_surface_2d *eng2d; +	struct nouveau_grobj *fahrenheit; +	struct nouveau_grobj *context_surfaces_3d; +	struct nouveau_notifier *sync; + +}; + +static INLINE struct nv04_screen * +nv04_screen(struct pipe_screen *screen) +{ +	return (struct nv04_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c new file mode 100644 index 0000000000..87c635f962 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.c @@ -0,0 +1,458 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +static void * +nv04_blend_state_create(struct pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nv04_blend_state *cb; + +	cb = MALLOC(sizeof(struct nv04_blend_state)); + +	cb->b_enable = cso->blend_enable ? 1 : 0; +	cb->b_src = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_src_factor))); +	cb->b_dst = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_dst_factor))); +	 + +	return (void *)cb; +} + +static void +nv04_blend_state_bind(struct pipe_context *pipe, void *blend) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	nv04->blend = (struct nv04_blend_state*)blend; + +	nv04->dirty |= NV04_NEW_BLEND; +} + +static void +nv04_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	free(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { +	unsigned ret; + +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT; +		break; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_CLAMP: +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; +	} +	return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT; +} + +static void * +nv04_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ + +	struct nv04_sampler_state *ss; +	uint32_t filter = 0; + +	ss = MALLOC(sizeof(struct nv04_sampler_state)); + +	ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) | +		    (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT)); + +	if (cso->max_anisotropy > 1.0) { +		filter |= 
NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE; +	} + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; +			break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR; +			break; +		} +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; +		break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST; +			break; +		} +		break; +	} + +	ss->filter = filter; + +	return (void *)ss; +} + +static void +nv04_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv04->sampler[unit] = sampler[unit]; +		nv04->dirty_samplers |= (1 << unit); +	} +} + +static void +nv04_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	free(hwcso); +} + +static void +nv04_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **miptree) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv04->tex_miptree[unit] = (struct nv04_miptree *)miptree[unit]; +		nv04->dirty_samplers |= (1 << unit); +	} +} + +static void * +nv04_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nv04_rasterizer_state *rs; + +	/*XXX: ignored: +	 * 	scissor +	 * 	points/lines (no hw support, emulated with tris in gallium) +	 */ +	rs = MALLOC(sizeof(struct nv04_rasterizer_state)); + +	rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD; + +	return (void *)rs; +} + +static void +nv04_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	nv04->rast = (struct nv04_rasterizer_state*)rast; + +	draw_set_rasterizer_state(nv04->draw, (nv04->rast ? 
nv04->rast->templ : NULL)); + +	nv04->dirty |= NV04_NEW_RAST | NV04_NEW_BLEND; +} + +static void +nv04_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	free(hwcso); +} + +static INLINE uint32_t nv04_compare_func(uint32_t f) +{ +	switch ( f ) { +		case PIPE_FUNC_NEVER:		return 1; +		case PIPE_FUNC_LESS:		return 2; +		case PIPE_FUNC_EQUAL:		return 3; +		case PIPE_FUNC_LEQUAL:		return 4; +		case PIPE_FUNC_GREATER:		return 5; +		case PIPE_FUNC_NOTEQUAL:	return 6; +		case PIPE_FUNC_GEQUAL:		return 7; +		case PIPE_FUNC_ALWAYS:		return 8; +	} +	NOUVEAU_MSG("Unable to find the function\n"); +	return 0; +} + +static void * +nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nv04_depth_stencil_alpha_state *hw; + +	hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state)); + +	hw->control = float_to_ubyte(cso->alpha.ref_value); +	hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT ); +	hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0; +	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN; +	hw->control |= cso->depth.enabled ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0; +	hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT ); +	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module +	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE; +	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE; +	hw->control |= cso->depth.writemask ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0; +	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format + +	return (void *)hw; +} + +static void +nv04_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	nv04->dsa = hwcso; +	nv04->dirty |= NV04_NEW_CONTROL; +} + +static void +nv04_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	free(hwcso); +} + +static void * +nv04_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *templ) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	return draw_create_vertex_shader(nv04->draw, templ); +} + +static void +nv04_vp_state_bind(struct pipe_context *pipe, void *shader) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	draw_bind_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); + +	nv04->dirty |= NV04_NEW_VERTPROG; +} + +static void +nv04_vp_state_delete(struct pipe_context *pipe, void *shader) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	draw_delete_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv04_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv04_fragment_program *fp; + +	fp = CALLOC(1, sizeof(struct nv04_fragment_program)); +	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + +	return (void *)fp; +} + +static void +nv04_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	struct nv04_fragment_program *fp = hwcso; + +	nv04->fragprog.current = fp; +	nv04->dirty |= NV04_NEW_FRAGPROG; +} + +static void +nv04_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv04_context *nv04 = 
nv04_context(pipe); +	struct nv04_fragment_program *fp = hwcso; + +	nv04_fragprog_destroy(nv04, fp); +	free((void*)fp->pipe.tokens); +	free(fp); +} + +static void +nv04_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +} + +static void +nv04_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +} + +static void +nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	struct pipe_winsys *ws = pipe->winsys; + +	assert(shader < PIPE_SHADER_TYPES); +	assert(index == 0); + +	if (buf) { +		void *mapped; +		if (buf->buffer && buf->buffer->size && +                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) +		{ +			memcpy(nv04->constbuf[shader], mapped, buf->buffer->size); +			nv04->constbuf_nr[shader] = +				buf->buffer->size / (4 * sizeof(float)); +			ws->buffer_unmap(ws, buf->buffer); +		} +	} +} + +static void +nv04_set_framebuffer_state(struct pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	 +	nv04->framebuffer = (struct pipe_framebuffer_state*)fb; + +	nv04->dirty |= NV04_NEW_FRAMEBUFFER; +} +static void +nv04_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv04_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +/*	struct nv04_context *nv04 = nv04_context(pipe); + +	// XXX +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2); +	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx); +	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void +nv04_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *viewport) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	nv04->viewport = *viewport; + +	draw_set_viewport_state(nv04->draw, &nv04->viewport); +} + +static void +nv04_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +		       const struct pipe_vertex_buffer *buffers) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	memcpy(nv04->vtxbuf, buffers, count * sizeof(buffers[0])); +	nv04->dirty |= NV04_NEW_VTXARRAYS; + +	draw_set_vertex_buffers(nv04->draw, count, buffers); +} + +static void +nv04_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_element *elements) +{ +	struct nv04_context *nv04 = nv04_context(pipe); + +	memcpy(nv04->vtxelt, elements, sizeof(*elements) * count); +	nv04->dirty |= NV04_NEW_VTXARRAYS; + +	draw_set_vertex_elements(nv04->draw, count, elements); +} + +void +nv04_init_state_functions(struct nv04_context *nv04) +{ +	nv04->pipe.create_blend_state = nv04_blend_state_create; +	nv04->pipe.bind_blend_state = nv04_blend_state_bind; +	nv04->pipe.delete_blend_state = nv04_blend_state_delete; + +	nv04->pipe.create_sampler_state = nv04_sampler_state_create; +	nv04->pipe.bind_sampler_states = nv04_sampler_state_bind; +	nv04->pipe.delete_sampler_state = nv04_sampler_state_delete; +	nv04->pipe.set_sampler_textures = nv04_set_sampler_texture; + +	nv04->pipe.create_rasterizer_state = nv04_rasterizer_state_create; +	nv04->pipe.bind_rasterizer_state = nv04_rasterizer_state_bind; +	nv04->pipe.delete_rasterizer_state = nv04_rasterizer_state_delete; + +	nv04->pipe.create_depth_stencil_alpha_state = 
nv04_depth_stencil_alpha_state_create; +	nv04->pipe.bind_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_bind; +	nv04->pipe.delete_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_delete; + +	nv04->pipe.create_vs_state = nv04_vp_state_create; +	nv04->pipe.bind_vs_state = nv04_vp_state_bind; +	nv04->pipe.delete_vs_state = nv04_vp_state_delete; + +	nv04->pipe.create_fs_state = nv04_fp_state_create; +	nv04->pipe.bind_fs_state = nv04_fp_state_bind; +	nv04->pipe.delete_fs_state = nv04_fp_state_delete; + +	nv04->pipe.set_blend_color = nv04_set_blend_color; +	nv04->pipe.set_clip_state = nv04_set_clip_state; +	nv04->pipe.set_constant_buffer = nv04_set_constant_buffer; +	nv04->pipe.set_framebuffer_state = nv04_set_framebuffer_state; +	nv04->pipe.set_polygon_stipple = nv04_set_polygon_stipple; +	nv04->pipe.set_scissor_state = nv04_set_scissor_state; +	nv04->pipe.set_viewport_state = nv04_set_viewport_state; + +	nv04->pipe.set_vertex_buffers = nv04_set_vertex_buffers; +	nv04->pipe.set_vertex_elements = nv04_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv04/nv04_state.h b/src/gallium/drivers/nv04/nv04_state.h new file mode 100644 index 0000000000..15d4685ec1 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.h @@ -0,0 +1,74 @@ +#ifndef __NV04_STATE_H__ +#define __NV04_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv04_blend_state { +	uint32_t b_enable; +	uint32_t b_src; +	uint32_t b_dst; +}; + +struct nv04_fragtex_state { +	uint32_t format; +}; + +struct nv04_sampler_state { +	uint32_t filter; +	uint32_t format; +}; + +struct nv04_depth_stencil_alpha_state { +	uint32_t control; +}; + +struct nv04_rasterizer_state { +	uint32_t blend; + +	const struct pipe_rasterizer_state *templ; +}; + +struct nv04_miptree { +	struct pipe_texture base; + +	struct pipe_buffer *buffer; +	uint total_size; + +	struct pipe_texture *shadow_tex; +	struct pipe_surface *shadow_surface; + +	struct { +		uint pitch; +		uint image_offset; +	} level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +struct nv04_fragment_program_data { +	unsigned offset; +	unsigned index; +}; + +struct nv04_fragment_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; + +	boolean translated; +	boolean on_hw; +	unsigned samplers; + +	uint32_t *insn; +	int       insn_len; + +	struct nv04_fragment_program_data *consts; +	unsigned nr_consts; + +	struct pipe_buffer *buffer; + +	uint32_t fp_control; +	uint32_t fp_reg_control; +}; + + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c new file mode 100644 index 0000000000..bd8ef1adbf --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state_emit.c @@ -0,0 +1,223 @@ +#include "nv04_context.h" +#include "nv04_state.h" + +static void nv04_vertex_layout(struct pipe_context* pipe) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	struct nv04_fragment_program *fp = nv04->fragprog.current; +	uint32_t src = 0; +	int i; +	struct vertex_info vinfo; + +	memset(&vinfo, 0, sizeof(vinfo)); + +	for (i = 0; i < fp->info.num_inputs; i++) { +		int isn = fp->info.input_semantic_name[i]; +		int isi = fp->info.input_semantic_index[i]; +		switch (isn) { +			case TGSI_SEMANTIC_POSITION: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); +				break; +			case TGSI_SEMANTIC_COLOR: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); +				break; +			default: +			case TGSI_SEMANTIC_GENERIC: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); +				
break; +			case TGSI_SEMANTIC_FOG: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); +				break; +		} +	} + +	printf("%d vertex input\n",fp->info.num_inputs); +	draw_compute_vertex_size(&vinfo); +} + +static uint32_t nv04_blend_func(uint32_t f) +{ +	switch ( f ) { +		case PIPE_BLENDFACTOR_ZERO:			return 0x1; +		case PIPE_BLENDFACTOR_ONE:			return 0x2; +		case PIPE_BLENDFACTOR_SRC_COLOR:		return 0x3; +		case PIPE_BLENDFACTOR_INV_SRC_COLOR:		return 0x4; +		case PIPE_BLENDFACTOR_SRC_ALPHA:		return 0x5; +		case PIPE_BLENDFACTOR_INV_SRC_ALPHA:		return 0x6; +		case PIPE_BLENDFACTOR_DST_ALPHA:		return 0x7; +		case PIPE_BLENDFACTOR_INV_DST_ALPHA:		return 0x8; +		case PIPE_BLENDFACTOR_DST_COLOR:		return 0x9; +		case PIPE_BLENDFACTOR_INV_DST_COLOR:		return 0xA; +		case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:	return 0xB; +	} +	NOUVEAU_MSG("Unable to find the blend function 0x%x\n",f); +	return 0; +} + +static void nv04_emit_control(struct nv04_context* nv04) +{ +	uint32_t control = nv04->dsa->control; + +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); +	OUT_RING(control); +} + +static void nv04_emit_blend(struct nv04_context* nv04) +{ +	uint32_t blend; + +	blend=0x4; // texture MODULATE_ALPHA +	blend|=0x20; // alpha is MSB +	blend|=(2<<6); // flat shading +	blend|=(1<<8); // persp correct +	blend|=(0<<16); // no fog +	blend|=(nv04->blend->b_enable<<20); +	blend|=(nv04_blend_func(nv04->blend->b_src)<<24); +	blend|=(nv04_blend_func(nv04->blend->b_dst)<<28); + +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); +	OUT_RING(blend); +} + +static void nv04_emit_sampler(struct nv04_context *nv04, int unit) +{ +	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; +	struct pipe_texture *pt = &nv04mt->base; + +	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3); +	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); +	OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); +	OUT_RING(nv04->sampler[unit]->filter); +} + +static void nv04_state_emit_framebuffer(struct nv04_context* nv04) +{ +	struct pipe_framebuffer_state* fb = nv04->framebuffer; +	struct pipe_surface *rt, *zeta; +	uint32_t rt_format, w, h; +	int colour_format = 0, zeta_format = 0; +	struct nv04_miptree *nv04mt = 0; + +	w = fb->cbufs[0]->width; +	h = fb->cbufs[0]->height; +	colour_format = fb->cbufs[0]->format; +	rt = fb->cbufs[0]; + +	if (fb->zsbuf) { +		if (colour_format) { +			assert(w == fb->zsbuf->width); +			assert(h == fb->zsbuf->height); +		} else { +			w = fb->zsbuf->width; +			h = fb->zsbuf->height; +		} + +		zeta_format = fb->zsbuf->format; +		zeta = fb->zsbuf; +	} + +	switch (colour_format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case 0: +		rt_format = 0x108; +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		rt_format = 0x103; +		break; +	default: +		assert(0); +	} + +	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1); +	OUT_RING(rt_format); + +	nv04mt = (struct nv04_miptree *)rt->texture; +	/* FIXME pitches have to be aligned ! 
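/*
 * Regarding the FIXME above: the 2D blit path in nv04_surface_2d.c falls
 * back to M2MF whenever a surface offset or stride is not 64-byte aligned,
 * so a conservative reading is that the 3D surface pitch wants the same
 * alignment.  A sketch of the missing rounding; the 64-byte figure is an
 * assumption inferred from those checks, not documented here:
 */
static INLINE unsigned
nv04_align_pitch(unsigned pitch)
{
	return (pitch + 63) & ~63;
}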
*/ +	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); +	OUT_RING(rt->stride|(zeta->stride<<16)); +	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	if (fb->zsbuf) { +		nv04mt = (struct nv04_miptree *)zeta->texture; +		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); +		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	} +} + +void +nv04_emit_hw_state(struct nv04_context *nv04) +{ +	int i; + +	if (nv04->dirty & NV04_NEW_VERTPROG) { +		//nv04_vertprog_bind(nv04, nv04->vertprog.current); +		nv04->dirty &= ~NV04_NEW_VERTPROG; +	} + +	if (nv04->dirty & NV04_NEW_FRAGPROG) { +		nv04_fragprog_bind(nv04, nv04->fragprog.current); +		nv04->dirty &= ~NV04_NEW_FRAGPROG; +		nv04->dirty_samplers |= (1<<10); +		nv04->dirty_samplers = 0; +	} + +	if (nv04->dirty & NV04_NEW_CONTROL) { +		nv04->dirty &= ~NV04_NEW_CONTROL; + +		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); +		OUT_RING(nv04->dsa->control); +	} + +	if (nv04->dirty & NV04_NEW_BLEND) { +		nv04->dirty &= ~NV04_NEW_BLEND; + +		nv04_emit_blend(nv04); +	} + +	if (nv04->dirty & NV04_NEW_VTXARRAYS) { +		nv04->dirty &= ~NV04_NEW_VTXARRAYS; +		nv04_vertex_layout(nv04); +	} + +	if (nv04->dirty & NV04_NEW_SAMPLER) { +		nv04->dirty &= ~NV04_NEW_SAMPLER; + +		nv04_emit_sampler(nv04, 0); +	} + +	if (nv04->dirty & NV04_NEW_VIEWPORT) { +		nv04->dirty &= ~NV04_NEW_VIEWPORT; +//		nv04_state_emit_viewport(nv04); +	} + + 	if (nv04->dirty & NV04_NEW_FRAMEBUFFER) { +		nv04->dirty &= ~NV04_NEW_FRAMEBUFFER; +		nv04_state_emit_framebuffer(nv04); +	} + +	/* Emit relocs for every referenced buffer. +	 * This is to ensure the bufmgr has an accurate idea of how +	 * the buffer is used.  This isn't very efficient, but we don't +	 * seem to take a significant performance hit.  Will be improved +	 * at some point.  Vertex arrays are emitted by nv04_vbo.c +	 */ + +	/* Render target */ +	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); +	OUT_RING(nv04->rt->stride|(nv04->zeta->stride<<16)); +	OUT_RELOCl(nv04->rt, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	if (nv04->zeta) { +		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); +		OUT_RELOCl(nv04->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	} + +	/* Texture images */ +	for (i = 0; i < 1; i++) { +		if (!(nv04->fp_samplers & (1 << i))) +			continue; +		struct nv04_miptree *nv04mt = nv04->tex_miptree[i]; +		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2); +		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); +		OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); +	} +} + diff --git a/src/gallium/drivers/nv04/nv04_surface.c b/src/gallium/drivers/nv04/nv04_surface.c new file mode 100644 index 0000000000..14abf16679 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "nv04_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv04_surface_copy(struct pipe_context *pipe, boolean do_flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	struct nv04_surface_2d *eng2d = nv04->screen->eng2d; + +	if (do_flip) { +		desty += height; +		while (height--) { +			eng2d->copy(eng2d, dest, destx, desty--, src, +				    srcx, srcy++, width, 1); +		} +		return; +	} + +	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv04_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv04_context *nv04 = nv04_context(pipe); +	struct nv04_surface_2d *eng2d = nv04->screen->eng2d; + +	eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv04_init_surface_functions(struct nv04_context *nv04) +{ +	nv04->pipe.surface_copy = nv04_surface_copy; +	nv04->pipe.surface_fill = nv04_surface_fill; +} diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c new file mode 100644 index 0000000000..230cfd17dd --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface_2d.c @@ -0,0 +1,448 @@ +#include "pipe/p_context.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_util.h" +#include "nv04_surface_2d.h" + +static INLINE int +nv04_surface_format(enum pipe_format format) +{ +	switch (format) { +	case PIPE_FORMAT_A8_UNORM: +		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; +	case PIPE_FORMAT_R16_SNORM: +	case PIPE_FORMAT_R5G6B5_UNORM: +		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; +	case PIPE_FORMAT_X8R8G8B8_UNORM: +	case PIPE_FORMAT_A8R8G8B8_UNORM: +		return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8; +	case PIPE_FORMAT_Z24S8_UNORM: +		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; +	default: +		return -1; +	} +} + +static INLINE int +nv04_rect_format(enum pipe_format format) +{ +	switch (format) { +	case PIPE_FORMAT_A8_UNORM: +		return 
NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; +	case PIPE_FORMAT_R5G6B5_UNORM: +		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5; +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case PIPE_FORMAT_Z24S8_UNORM: +		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; +	default: +		return -1; +	} +} + +static INLINE int +nv04_scaled_image_format(enum pipe_format format) +{ +	switch (format) { +	case PIPE_FORMAT_A1R5G5B5_UNORM: +		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5; +	case PIPE_FORMAT_A8R8G8B8_UNORM: +		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8; +	case PIPE_FORMAT_X8R8G8B8_UNORM: +		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8; +	case PIPE_FORMAT_R5G6B5_UNORM: +	case PIPE_FORMAT_R16_SNORM: +		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5; +	default: +		return -1; +	} +} + +static INLINE unsigned +nv04_swizzle_bits(unsigned x, unsigned y) +{ +	unsigned u = (x & 0x001) << 0 | +	             (x & 0x002) << 1 | +	             (x & 0x004) << 2 | +	             (x & 0x008) << 3 | +	             (x & 0x010) << 4 | +	             (x & 0x020) << 5 | +	             (x & 0x040) << 6 | +	             (x & 0x080) << 7 | +	             (x & 0x100) << 8 | +	             (x & 0x200) << 9 | +	             (x & 0x400) << 10 | +	             (x & 0x800) << 11; + +	unsigned v = (y & 0x001) << 1 | +	             (y & 0x002) << 2 | +	             (y & 0x004) << 3 | +	             (y & 0x008) << 4 | +	             (y & 0x010) << 5 | +	             (y & 0x020) << 6 | +	             (y & 0x040) << 7 | +	             (y & 0x080) << 8 | +	             (y & 0x100) << 9 | +	             (y & 0x200) << 10 | +	             (y & 0x400) << 11 | +	             (y & 0x800) << 12; +	return v | u; +} + +static int +nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx, +			  struct pipe_surface *dst, int dx, int dy, +			  struct pipe_surface *src, int sx, int sy, +			  int w, int h) +{ +	struct nouveau_channel *chan = ctx->nvws->channel; +	struct nouveau_grobj *swzsurf = ctx->swzsurf; +	struct nouveau_grobj *sifm = ctx->sifm; +	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); +	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); +	const unsigned max_w = 1024; +	const unsigned max_h = 1024; +	const unsigned sub_w = w > max_w ? max_w : w; +	const unsigned sub_h = h > max_h ? 
max_h : h; +	unsigned cx; +	unsigned cy; + +	/* POT or GTFO */ +	assert(!(w & (w - 1)) && !(h & (h - 1))); + +	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1); +	OUT_RELOCo(chan, dst_bo, +	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1); +	OUT_RING  (chan, nv04_surface_format(dst->format) | +	                 log2i(w) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT | +	                 log2i(h) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT); +  +	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1); +	OUT_RELOCo(chan, src_bo, +	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); +	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1); +	OUT_RING  (chan, swzsurf->handle); + +	for (cy = 0; cy < h; cy += sub_h) { +	  for (cx = 0; cx < w; cx += sub_w) { +	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1); +	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(cx, cy) * +			     dst->block.size, NOUVEAU_BO_GART | +			     NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	    BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9); +	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE); +	    OUT_RING  (chan, nv04_scaled_image_format(src->format)); +	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY); +	    OUT_RING  (chan, 0); +	    OUT_RING  (chan, sub_h << 16 | sub_w); +	    OUT_RING  (chan, 0); +	    OUT_RING  (chan, sub_h << 16 | sub_w); +	    OUT_RING  (chan, 1 << 20); +	    OUT_RING  (chan, 1 << 20); + +	    BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SIZE, 4); +	    OUT_RING  (chan, sub_h << 16 | sub_w); +	    OUT_RING  (chan, src->stride | +			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER | +			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE); +	    OUT_RELOCl(chan, src_bo, src->offset + cy * src->stride + +			     cx * src->block.size, NOUVEAU_BO_GART | +			     NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); +	    OUT_RING  (chan, 0); +	  } +	} + +	return 0; +} + +static int +nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx, +		       struct pipe_surface *dst, int dx, int dy, +		       struct pipe_surface *src, int sx, int sy, int w, int h) +{ +	struct nouveau_channel *chan = ctx->nvws->channel; +	struct nouveau_grobj *m2mf = ctx->m2mf; +	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); +	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); +	unsigned dst_offset, src_offset; + +	dst_offset = dst->offset + (dy * dst->stride) + (dx * dst->block.size); +	src_offset = src->offset + (sy * src->stride) + (sx * src->block.size); + +	WAIT_RING (chan, 3 + ((h / 2047) + 1) * 9); +	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2); +	OUT_RELOCo(chan, src_bo, +		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); +	OUT_RELOCo(chan, dst_bo, +		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	while (h) { +		int count = (h > 2047) ? 
2047 : h; + +		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); +		OUT_RELOCl(chan, src_bo, src_offset, +			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); +		OUT_RELOCl(chan, dst_bo, dst_offset, +			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR); +		OUT_RING  (chan, src->stride); +		OUT_RING  (chan, dst->stride); +		OUT_RING  (chan, w * src->block.size); +		OUT_RING  (chan, count); +		OUT_RING  (chan, 0x0101); +		OUT_RING  (chan, 0); + +		h -= count; +		src_offset += src->stride * count; +		dst_offset += dst->stride * count; +	} + +	return 0; +} + +static int +nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst, +		       int dx, int dy, struct pipe_surface *src, int sx, int sy, +		       int w, int h) +{ +	struct nouveau_channel *chan = ctx->nvws->channel; +	struct nouveau_grobj *surf2d = ctx->surf2d; +	struct nouveau_grobj *blit = ctx->blit; +	struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); +	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); +	int format; + +	format = nv04_surface_format(dst->format); +	if (format < 0) +		return 1; + +	WAIT_RING (chan, 12); +	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); +	OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); +	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); +	OUT_RING  (chan, format); +	OUT_RING  (chan, (dst->stride << 16) | src->stride); +	OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); +	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	BEGIN_RING(chan, blit, 0x0300, 3); +	OUT_RING  (chan, (sy << 16) | sx); +	OUT_RING  (chan, (dy << 16) | dx); +	OUT_RING  (chan, ( h << 16) |  w); + +	return 0; +} + +static void +nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst, +		  int dx, int dy, struct pipe_surface *src, int sx, int sy, +		  int w, int h) +{ +	int src_linear = src->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR; +	int dst_linear = dst->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR; + +	assert(src->format == dst->format); + +	/* Setup transfer to swizzle the texture to vram if needed */ +	if (src_linear && !dst_linear && w > 1 && h > 1) { +		nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h); +		return; +	} + +	/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback +	 * to NV_MEMORY_TO_MEMORY_FORMAT in this case. 
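/*
 * nv04_swizzle_bits() above is a 2-D Morton (Z-order) interleave: bit i of
 * x lands in output bit 2*i and bit i of y in bit 2*i+1, which is how the
 * destination offset inside a swizzled surface is derived from (cx, cy).
 * An equivalent loop-based form, for clarity (illustrative only):
 */
static unsigned
morton_2d(unsigned x, unsigned y)
{
	unsigned out = 0;
	unsigned bit;

	for (bit = 0; bit < 12; bit++) {
		out |= ((x >> bit) & 1) << (2 * bit);
		out |= ((y >> bit) & 1) << (2 * bit + 1);
	}
	return out;   /* matches nv04_swizzle_bits(x, y) for x, y < 4096 */
}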
+	 */ +	if ((src->offset & 63) || (dst->offset & 63) || +	    (src->stride & 63) || (dst->stride & 63)) { +		nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h); +		return; +	} + +	nv04_surface_copy_blit(ctx, dst, dx, dy, src, sx, sy, w, h); +} + +static void +nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst, +		  int dx, int dy, int w, int h, unsigned value) +{ +	struct nouveau_channel *chan = ctx->nvws->channel; +	struct nouveau_grobj *surf2d = ctx->surf2d; +	struct nouveau_grobj *rect = ctx->rect; +	struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); +	int cs2d_format, gdirect_format; + +	cs2d_format = nv04_surface_format(dst->format); +	assert(cs2d_format >= 0); + +	gdirect_format = nv04_rect_format(dst->format); +	assert(gdirect_format >= 0); + +	WAIT_RING (chan, 16); +	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); +	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); +	OUT_RING  (chan, cs2d_format); +	OUT_RING  (chan, (dst->stride << 16) | dst->stride); +	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1); +	OUT_RING  (chan, gdirect_format); +	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1); +	OUT_RING  (chan, value); +	BEGIN_RING(chan, rect, +		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2); +	OUT_RING  (chan, (dx << 16) | dy); +	OUT_RING  (chan, ( w << 16) |  h); +} + +void +nv04_surface_2d_takedown(struct nv04_surface_2d **pctx) +{ +	struct nv04_surface_2d *ctx; + +	if (!pctx || !*pctx) +		return; +	ctx = *pctx; +	*pctx = NULL; + +	nouveau_notifier_free(&ctx->ntfy); +	nouveau_grobj_free(&ctx->m2mf); +	nouveau_grobj_free(&ctx->surf2d); +	nouveau_grobj_free(&ctx->swzsurf); +	nouveau_grobj_free(&ctx->rect); +	nouveau_grobj_free(&ctx->blit); +	nouveau_grobj_free(&ctx->sifm); + +	FREE(ctx); +} + +struct nv04_surface_2d * +nv04_surface_2d_init(struct nouveau_winsys *nvws) +{ +	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d); +	struct nouveau_channel *chan = nvws->channel; +	unsigned handle = 0x88000000, class; +	int ret; + +	if (!ctx) +		return NULL; + +	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1); +	OUT_RING  (chan, ctx->ntfy->handle); + +	if (chan->device->chipset < 0x10) +		class = NV04_CONTEXT_SURFACES_2D; +	else +		class = NV10_CONTEXT_SURFACES_2D; + +	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	BEGIN_RING(chan, ctx->surf2d, +			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); +	OUT_RING  (chan, chan->vram->handle); +	OUT_RING  (chan, chan->vram->handle); + +	if (chan->device->chipset < 0x10) +		class = NV04_IMAGE_BLIT; +	else +		class = NV12_IMAGE_BLIT; + +	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_DMA_NOTIFY, 1); +	OUT_RING  (chan, ctx->ntfy->handle); +	BEGIN_RING(chan, 
ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1); +	OUT_RING  (chan, ctx->surf2d->handle); +	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_OPERATION, 1); +	OUT_RING  (chan, NV04_IMAGE_BLIT_OPERATION_SRCCOPY); + +	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT, +				  &ctx->rect); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1); +	OUT_RING  (chan, ctx->ntfy->handle); +	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1); +	OUT_RING  (chan, ctx->surf2d->handle); +	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1); +	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY); +	BEGIN_RING(chan, ctx->rect, +			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1); +	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE); + +	switch (chan->device->chipset & 0xf0) { +	case 0x00: +	case 0x10: +		class = NV04_SWIZZLED_SURFACE; +		break; +	case 0x20: +		class = NV20_SWIZZLED_SURFACE; +		break; +	case 0x30: +		class = NV30_SWIZZLED_SURFACE; +		break; +	case 0x40: +	case 0x60: +		class = NV40_SWIZZLED_SURFACE; +		break; +	default: +		/* Famous last words: this really can't happen.. */ +		assert(0); +		break; +	} + +	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	if (chan->device->chipset < 0x10) { +		class = NV04_SCALED_IMAGE_FROM_MEMORY; +	} else +	if (chan->device->chipset < 0x40) { +		class = NV10_SCALED_IMAGE_FROM_MEMORY; +	} else { +		class = NV40_SCALED_IMAGE_FROM_MEMORY; +	} + +	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm); +	if (ret) { +		nv04_surface_2d_takedown(&ctx); +		return NULL; +	} + +	ctx->nvws = nvws; +	ctx->copy = nv04_surface_copy; +	ctx->fill = nv04_surface_fill; +	return ctx; +} diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h new file mode 100644 index 0000000000..21b8f86960 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface_2d.h @@ -0,0 +1,29 @@ +#ifndef __NV04_SURFACE_2D_H__ +#define __NV04_SURFACE_2D_H__ + +struct nv04_surface_2d { +	struct nouveau_winsys *nvws; +	struct nouveau_notifier *ntfy; +	struct nouveau_grobj *surf2d; +	struct nouveau_grobj *swzsurf; +	struct nouveau_grobj *m2mf; +	struct nouveau_grobj *rect; +	struct nouveau_grobj *blit; +	struct nouveau_grobj *sifm; + +	struct pipe_buffer *(*buf)(struct pipe_surface *); + +	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst, +		     int dx, int dy, struct pipe_surface *src, int sx, int sy, +		     int w, int h); +	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst, +		     int dx, int dy, int w, int h, unsigned value); +}; + +struct nv04_surface_2d * +nv04_surface_2d_init(struct nouveau_winsys *nvws); + +void +nv04_surface_2d_takedown(struct nv04_surface_2d **); + +#endif diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c new file mode 100644 index 0000000000..d21a0e34f7 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_vbo.c @@ -0,0 +1,78 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv04_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, 
unsigned count) +{ +	struct nv04_context *nv04 = nv04_context( pipe ); +	struct draw_context *draw = nv04->draw; +	unsigned i; + +	nv04_emit_hw_state(nv04); + +	/* +	 * Map vertex buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv04->vtxbuf[i].buffer) { +			void *buf +				= pipe->winsys->buffer_map(pipe->winsys, +						nv04->vtxbuf[i].buffer, +						PIPE_BUFFER_USAGE_CPU_READ); +			draw_set_mapped_vertex_buffer(draw, i, buf); +		} +	} +	/* Map index buffer, if present */ +	if (indexBuffer) { +		void *mapped_indexes +			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer, +					PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); +	} +	else { +		/* no index/element buffer */ +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	draw_set_mapped_constant_buffer(draw, +					nv04->constbuf[PIPE_SHADER_VERTEX], +					nv04->constbuf_nr[PIPE_SHADER_VERTEX]); + +	/* draw! */ +	draw_arrays(nv04->draw, prim, start, count); + +	/* +	 * unmap vertex/index buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv04->vtxbuf[i].buffer) { +			pipe->winsys->buffer_unmap(pipe->winsys, nv04->vtxbuf[i].buffer); +			draw_set_mapped_vertex_buffer(draw, i, NULL); +		} +	} +	if (indexBuffer) { +		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	return TRUE; +} + +boolean nv04_draw_arrays( struct pipe_context *pipe, +				 unsigned prim, unsigned start, unsigned count) +{ +	printf("coucou in draw arrays\n"); +	return nv04_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv10/Makefile b/src/gallium/drivers/nv10/Makefile new file mode 100644 index 0000000000..4ba7ce586d --- /dev/null +++ b/src/gallium/drivers/nv10/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = nv10 + +DRIVER_SOURCES = \ +	nv10_clear.c \ +	nv10_context.c \ +	nv10_fragprog.c \ +	nv10_fragtex.c \ +	nv10_miptree.c \ +	nv10_prim_vbuf.c \ +	nv10_screen.c \ +	nv10_state.c \ +	nv10_state_emit.c \ +	nv10_surface.c \ +	nv10_vbo.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv10/nv10_clear.c b/src/gallium/drivers/nv10/nv10_clear.c new file mode 100644 index 0000000000..be7e09cf4b --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" + +void +nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c new file mode 100644 index 0000000000..ef2c0c5d9f --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.c @@ -0,0 +1,296 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	draw_flush(nv10->draw); + +	FIRE_RING(fence); +} + +static void +nv10_destroy(struct pipe_context *pipe) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	if (nv10->draw) +		draw_destroy(nv10->draw); + +	FREE(nv10); +} + +static void nv10_init_hwctx(struct nv10_context *nv10) +{ +	struct nv10_screen *screen = nv10->screen; +	struct nouveau_winsys *nvws = screen->nvws; +	int i; +	float projectionmatrix[16]; + +	BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1); +	OUT_RING  (screen->sync->handle); +	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2); +	OUT_RING  (nvws->channel->vram->handle); +	OUT_RING  (nvws->channel->gart->handle); +	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2); +	OUT_RING  (nvws->channel->vram->handle); +	OUT_RING  (nvws->channel->vram->handle); + +	BEGIN_RING(celsius, NV10TCL_NOP, 1); +	OUT_RING  (0); + +	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2); +	OUT_RING  (0); +	OUT_RING  (0); + +	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1); +	OUT_RING  ((0x7ff<<16)|0x800); +	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1); +	OUT_RING  ((0x7ff<<16)|0x800); + +	for (i=1;i<8;i++) { +		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1); +		OUT_RING  (0); +		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1); +		OUT_RING  (0); +	} + +	BEGIN_RING(celsius, 0x290, 1); +	OUT_RING  ((0x10<<16)|1); +	BEGIN_RING(celsius, 0x3f4, 1); +	OUT_RING  (0); + +	BEGIN_RING(celsius, NV10TCL_NOP, 1); +	OUT_RING  (0); + +	if (nv10->screen->celsius->grclass != NV10TCL) { +		/* For nv11, nv17 */ +		BEGIN_RING(celsius, 0x120, 3); +		OUT_RING  (0); +		OUT_RING  (1); +		OUT_RING  (2); + +		BEGIN_RING(celsius, NV10TCL_NOP, 1); +		OUT_RING  (0); +	} + +	BEGIN_RING(celsius, NV10TCL_NOP, 1); +	OUT_RING  (0); + +	/* Set state */ +	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2); +	OUT_RING  (0x207); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2); +	OUT_RING  (0); +	OUT_RING  (0); + +	BEGIN_RING(celsius, 
NV10TCL_RC_IN_ALPHA(0), 12); +	OUT_RING  (0x30141010); +	OUT_RING  (0); +	OUT_RING  (0x20040000); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0x00000c00); +	OUT_RING  (0); +	OUT_RING  (0x00000c00); +	OUT_RING  (0x18000000); +	OUT_RING  (0x300e0300); +	OUT_RING  (0x0c091c80); + +	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2); +	OUT_RING  (1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4); +	OUT_RING  (1); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0x8006); +	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8); +	OUT_RING  (0xff); +	OUT_RING  (0x207); +	OUT_RING  (0); +	OUT_RING  (0xff); +	OUT_RING  (0x1e00); +	OUT_RING  (0x1e00); +	OUT_RING  (0x1e00); +	OUT_RING  (0x1d01); +	BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); +	OUT_RING  (0x201); +	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); +	OUT_RING  (8); +	BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1); +	OUT_RING  (8); +	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); +	OUT_RING  (0x1b02); +	OUT_RING  (0x1b02); +	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2); +	OUT_RING  (0x405); +	OUT_RING  (0x901); +	BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8); +	for (i=0;i<8;i++) { +		OUT_RING  (0); +	} +	BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3); +	OUT_RING  (0x3fc00000);	/* -1.50 */ +	OUT_RING  (0xbdb8aa0a);	/* -0.09 */ +	OUT_RING  (0);		/*  0.00 */ + +	BEGIN_RING(celsius, NV10TCL_NOP, 1); +	OUT_RING  (0); + +	BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2); +	OUT_RING  (0x802); +	OUT_RING  (2); +	/* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when +	 * using texturing, except when using the texture matrix +	 */ +	BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1); +	OUT_RING  (6); +	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); +	OUT_RING  (0x01010101); + +	/* Set vertex component */ +	BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4); +	OUT_RINGf (1.0); +	OUT_RINGf (1.0); +	OUT_RINGf (1.0); +	OUT_RINGf (1.0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RINGf (1.0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0); +	OUT_RINGf (1.0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4); +	
OUT_RINGf (0.0); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0); +	OUT_RINGf (1.0); +	BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1); +	OUT_RINGf (0.0); +	BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1); +	OUT_RING  (1); + +	memset(projectionmatrix, 0, sizeof(projectionmatrix)); +	BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16); +	projectionmatrix[0*4+0] = 1.0; +	projectionmatrix[1*4+1] = 1.0; +	projectionmatrix[2*4+2] = 1.0; +	projectionmatrix[3*4+3] = 1.0; +	for (i=0;i<16;i++) { +		OUT_RINGf  (projectionmatrix[i]); +	} + +	BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2); +	OUT_RING  (0.0); +	OUT_RINGf  (16777216.0); + +	BEGIN_RING(celsius, NV10TCL_VIEWPORT_SCALE_X, 4); +	OUT_RINGf  (-2048.0); +	OUT_RINGf  (-2048.0); +	OUT_RINGf  (16777215.0 * 0.5); +	OUT_RING  (0); + +	FIRE_RING (NULL); +} + +static void +nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv10_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct nv10_screen *screen = nv10_screen(pscreen); +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv10_context *nv10; +	struct nouveau_winsys *nvws = screen->nvws; + +	nv10 = CALLOC(1, sizeof(struct nv10_context)); +	if (!nv10) +		return NULL; +	nv10->screen = screen; +	nv10->pctx_id = pctx_id; + +	nv10->nvws = nvws; + +	nv10->pipe.winsys = ws; +	nv10->pipe.screen = pscreen; +	nv10->pipe.destroy = nv10_destroy; +	nv10->pipe.set_edgeflags = nv10_set_edgeflags; +	nv10->pipe.draw_arrays = nv10_draw_arrays; +	nv10->pipe.draw_elements = nv10_draw_elements; +	nv10->pipe.clear = nv10_clear; +	nv10->pipe.flush = nv10_flush; + +	nv10_init_surface_functions(nv10); +	nv10_init_state_functions(nv10); + +	nv10->draw = draw_create(); +	assert(nv10->draw); +	draw_set_rasterize_stage(nv10->draw, nv10_draw_vbuf_stage(nv10)); + +	nv10_init_hwctx(nv10); + +	return &nv10->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h new file mode 100644 index 0000000000..f3b56de25a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.h @@ -0,0 +1,153 @@ +#ifndef __NV10_CONTEXT_H__ +#define __NV10_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \ +	struct nv10_screen *ctx = nv10->screen +#include "nouveau/nouveau_push.h" + +#include "nv10_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ +	fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV10_NEW_VERTPROG	(1 << 0) +#define NV10_NEW_FRAGPROG	(1 << 1) +#define NV10_NEW_VTXARRAYS	(1 << 2) +#define NV10_NEW_BLEND		(1 << 3) +#define NV10_NEW_BLENDCOL	(1 << 4) +#define NV10_NEW_RAST 		(1 << 5) +#define NV10_NEW_DSA  		(1 << 6) +#define NV10_NEW_VIEWPORT	(1 << 7) +#define NV10_NEW_SCISSOR	(1 << 8) +#define NV10_NEW_FRAMEBUFFER	(1 << 9) + +#include "nv10_screen.h" + +struct nv10_context { +	struct pipe_context pipe; + +	struct nouveau_winsys *nvws; +	struct nv10_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	uint32_t dirty; + +	struct nv10_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; +	struct nv10_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; +	unsigned dirty_samplers; +	unsigned fp_samplers; +	unsigned vp_samplers; + +	uint32_t rt_enable; +	struct pipe_buffer *rt[4]; +	struct pipe_buffer *zeta; +	uint32_t lma_offset; + +	struct nv10_blend_state *blend; +	struct pipe_blend_color *blend_color; +	struct nv10_rasterizer_state *rast; +	struct nv10_depth_stencil_alpha_state *dsa; +	struct pipe_viewport_state *viewport; +	struct pipe_scissor_state *scissor; +	struct pipe_framebuffer_state *framebuffer; + +	//struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; +	float *constbuf[PIPE_SHADER_TYPES][32][4]; +	unsigned constbuf_nr[PIPE_SHADER_TYPES]; + +	struct vertex_info vertex_info; + +	struct { +		struct pipe_buffer *buffer; +		uint32_t format; +	} tex[2]; + +	unsigned vb_enable; +	struct { +		struct pipe_buffer *buffer; +		unsigned delta; +	} vb[16]; + +/*	struct { +	 +		struct nouveau_resource *exec_heap; +		struct nouveau_resource *data_heap; + +		struct nv10_vertex_program *active; + +		struct nv10_vertex_program *current; +	} vertprog; +*/ +	struct { +		struct nv10_fragment_program *active; + +		struct nv10_fragment_program *current; +		struct pipe_buffer *constant_buf; +	} fragprog; + +	struct pipe_vertex_buffer  vtxbuf[PIPE_MAX_ATTRIBS]; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv10_context * +nv10_context(struct pipe_context *pipe) +{ +	return (struct nv10_context *)pipe; +} + +extern void nv10_init_state_functions(struct nv10_context *nv10); +extern void nv10_init_surface_functions(struct nv10_context *nv10); + +extern void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv10_clear.c */ +extern void nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +/* nv10_draw.c */ +extern struct draw_stage *nv10_draw_render_stage(struct nv10_context *nv10); + +/* nv10_fragprog.c */ +extern void nv10_fragprog_bind(struct nv10_context *, +			       struct nv10_fragment_program *); +extern void nv10_fragprog_destroy(struct nv10_context *, +				  struct nv10_fragment_program *); + +/* nv10_fragtex.c */ +extern void nv10_fragtex_bind(struct nv10_context *); + +/* nv10_prim_vbuf.c */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ); +extern void nv10_vtxbuf_bind(struct nv10_context* nv10); + +/* nv10_state.c and friends */ +extern void nv10_emit_hw_state(struct nv10_context *nv10); +extern void nv10_state_tex_update(struct nv10_context *nv10); + +/* nv10_vbo.c */ +extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv10_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count); + 
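+/* Note on the draw path: nv10_draw_arrays()/nv10_draw_elements() (nv10_vbo.c)
+ * currently run all geometry through the software draw module; the vbuf
+ * stage in nv10_prim_vbuf.c then emits the actual hardware commands. */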
+ +#endif diff --git a/src/gallium/drivers/nv10/nv10_fragprog.c b/src/gallium/drivers/nv10/nv10_fragprog.c new file mode 100644 index 0000000000..698db5a16a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv10_context.h" + +void +nv10_fragprog_bind(struct nv10_context *nv10, struct nv10_fragment_program *fp) +{ +} + +void +nv10_fragprog_destroy(struct nv10_context *nv10, +		      struct nv10_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c new file mode 100644 index 0000000000..27f2f87584 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragtex.c @@ -0,0 +1,124 @@ +#include "nv10_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf)                                                                \ +{                                                                              \ +  TRUE,                                                                        \ +  PIPE_FORMAT_##m,                                                             \ +  NV10TCL_TX_FORMAT_FORMAT_##tf,                                               \ +} + +struct nv10_texture_format { +	boolean defined; +	uint	pipe; +	int     format; +}; + +static struct nv10_texture_format +nv10_texture_formats[] = { +	_(A8R8G8B8_UNORM, A8R8G8B8), +	_(A1R5G5B5_UNORM, A1R5G5B5), +	_(A4R4G4B4_UNORM, A4R4G4B4), +	_(L8_UNORM      , L8      ), +	_(A8_UNORM      , A8      ), +	_(A8L8_UNORM    , A8L8    ), +//	_(RGB_DXT1      , DXT1,   ), +//	_(RGBA_DXT1     , DXT1,   ), +//	_(RGBA_DXT3     , DXT3,   ), +//	_(RGBA_DXT5     , DXT5,   ), +	{}, +}; + +static struct nv10_texture_format * +nv10_fragtex_format(uint pipe_format) +{ +	struct nv10_texture_format *tf = nv10_texture_formats; + +	while (tf->defined) { +		if (tf->pipe == pipe_format) +			return tf; +		tf++; +	} + +	return NULL; +} + + +static void +nv10_fragtex_build(struct nv10_context *nv10, int unit) +{ +#if 0 +	struct nv10_sampler_state *ps = nv10->tex_sampler[unit]; +	struct nv10_miptree *nv10mt = nv10->tex_miptree[unit]; +	struct pipe_texture *pt = &nv10mt->base; +	struct nv10_texture_format *tf; +	uint32_t txf, txs, txp; + +	tf = nv10_fragtex_format(pt->format); +	if (!tf || !tf->defined) { +		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format); +		return; +	} + +	txf  = tf->format << 8; +	txf |= (pt->last_level + 1) << 16; +	txf |= log2i(pt->width[0]) << 20; +	txf |= log2i(pt->height[0]) << 24; +	txf |= log2i(pt->depth[0]) << 28; +	txf |= 8; + +	switch (pt->target) { +	case PIPE_TEXTURE_CUBE: +		txf |= NV10TCL_TX_FORMAT_CUBE_MAP; +		/* fall-through */ +	case PIPE_TEXTURE_2D: +		txf |= (2<<4); +		break; +	case PIPE_TEXTURE_1D: +		txf |= (1<<4); +		break; +	default: +		NOUVEAU_ERR("Unknown target %d\n", pt->target); +		return; +	} + +	BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8); +	OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); +	OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); +	OUT_RING  (ps->wrap); +	OUT_RING  (0x40000000); /* enable */ +	OUT_RING  (txs); +	OUT_RING  (ps->filt | 0x2000 /* magic */); +	OUT_RING  ((pt->width[0] << 16) | pt->height[0]); +	OUT_RING  (ps->bcol); +#endif +} + +void +nv10_fragtex_bind(struct nv10_context *nv10) +{ +#if 0 +	struct 
nv10_fragment_program *fp = nv10->fragprog.active; +	unsigned samplers, unit; + +	samplers = nv10->fp_samplers & ~fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1); +		OUT_RING  (0); +	} + +	samplers = nv10->dirty_samplers & fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		nv10_fragtex_build(nv10, unit); +	} + +	nv10->fp_samplers = fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv10/nv10_miptree.c b/src/gallium/drivers/nv10/nv10_miptree.c new file mode 100644 index 0000000000..9616135461 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_miptree.c @@ -0,0 +1,174 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_miptree_layout(struct nv10_miptree *nv10mt) +{ +	struct pipe_texture *pt = &nv10mt->base; +	boolean swizzled = FALSE; +	uint width = pt->width[0], height = pt->height[0]; +	uint offset = 0; +	int nr_faces, l, f; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		nr_faces = 6; +	} else { +		nr_faces = 1; +	} +	 +	for (l = 0; l <= pt->last_level; l++) { +		pt->width[l] = width; +		pt->height[l] = height; +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + +		if (swizzled) +			nv10mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; +		else +			nv10mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; +		nv10mt->level[l].pitch = (nv10mt->level[l].pitch + 63) & ~63; + +		nv10mt->level[l].image_offset = +			CALLOC(nr_faces, sizeof(unsigned)); + +		width  = MAX2(1, width  >> 1); +		height = MAX2(1, height >> 1); + +	} + +	for (f = 0; f < nr_faces; f++) { +		for (l = 0; l <= pt->last_level; l++) { +			nv10mt->level[l].image_offset[f] = offset; +			offset += nv10mt->level[l].pitch * pt->height[l]; +		} +	} + +	nv10mt->total_size = offset; +} + +static struct pipe_texture * +nv10_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv10_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv10_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->level[0].pitch = stride[0]; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static struct pipe_texture * +nv10_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ +	struct pipe_winsys *ws = screen->winsys; +	struct nv10_miptree *mt; + +	mt = MALLOC(sizeof(struct nv10_miptree)); +	if (!mt) +		return NULL; +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = screen; + +	nv10_miptree_layout(mt); + +	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL, +					   mt->total_size); +	if (!mt->buffer) { +		FREE(mt); +		return NULL; +	} +	 +	return &mt->base; +} + +static void +nv10_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ +	struct pipe_texture *mt = *pt; + +	*pt = NULL; +	if (--mt->refcount <= 0) { +		struct nv10_miptree *nv10mt = (struct nv10_miptree *)mt; +		int l; + +		pipe_buffer_reference(screen, &nv10mt->buffer, NULL); +		for (l = 0; l <= mt->last_level; l++) 
{ +			if (nv10mt->level[l].image_offset) +				FREE(nv10mt->level[l].image_offset); +		} +		FREE(nv10mt); +	} +} + +static void +nv10_miptree_update(struct pipe_context *pipe, struct pipe_texture *mt, +		    uint face, uint levels) +{ +} + + +static struct pipe_surface * +nv10_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct pipe_winsys *ws = screen->winsys; +	struct nv10_miptree *nv10mt = (struct nv10_miptree *)pt; +	struct pipe_surface *ps; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = nv10mt->level[level].pitch; +	ps->refcount = 1; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		ps->offset = nv10mt->level[level].image_offset[face]; +	} else { +		ps->offset = nv10mt->level[level].image_offset[0]; +	} + +	return ps; +} + +static void +nv10_miptree_surface_release(struct pipe_screen *screen, +			     struct pipe_surface **surface) +{ +} + +void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv10_miptree_create; +	pscreen->texture_blanket = nv10_miptree_blanket; +	pscreen->texture_release = nv10_miptree_release; +	pscreen->get_tex_surface = nv10_miptree_surface_get; +	pscreen->tex_surface_release = nv10_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c new file mode 100644 index 0000000000..7435d87315 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c @@ -0,0 +1,245 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. 
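+ * Here the accumulated vertices land in a PIPE_BUFFER_USAGE_VERTEX buffer
+ * and the index lists are emitted with NV10TCL_VB_ELEMENT_U16/U32 in
+ * nv10_vbuf_render_draw() below.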
+ * + * XXX: work in progress  + *  + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv10. + */ +struct nv10_vbuf_render { +	struct vbuf_render base; + +	struct nv10_context *nv10;    + +	/** Vertex buffer */ +	struct pipe_buffer* buffer; + +	/** Vertex size in bytes */ +	unsigned vertex_size; + +	/** Hardware primitive */ +	unsigned hwprim; +}; + + +void nv10_vtxbuf_bind( struct nv10_context* nv10 ) +{ +	int i; +	for(i = 0; i < 8; i++) { +		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1); +		OUT_RING(0/*nv10->vtxbuf*/); +		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1); +		OUT_RING(0/*XXX*/); +	} +} + +/** + * Basically a cast wrapper. + */ +static INLINE struct nv10_vbuf_render * +nv10_vbuf_render( struct vbuf_render *render ) +{ +	assert(render); +	return (struct nv10_vbuf_render *)render; +} + + +static const struct vertex_info * +nv10_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	struct nv10_context *nv10 = nv10_render->nv10; + +	nv10_emit_hw_state(nv10); + +	return &nv10->vertex_info; +} + + +static void * +nv10_vbuf_render_allocate_vertices( struct vbuf_render *render, +		ushort vertex_size, +		ushort nr_vertices ) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	struct nv10_context *nv10 = nv10_render->nv10; +	struct pipe_winsys *winsys = nv10->pipe.winsys; +	size_t size = (size_t)vertex_size * (size_t)nr_vertices; + +	assert(!nv10_render->buffer); +	nv10_render->buffer = winsys->buffer_create(winsys, 64, PIPE_BUFFER_USAGE_VERTEX, size); + +	nv10->dirty |= NV10_NEW_VTXARRAYS; + +	return winsys->buffer_map(winsys,  +			nv10_render->buffer,  +			PIPE_BUFFER_USAGE_CPU_WRITE); +} + + +static boolean +nv10_vbuf_render_set_primitive( struct vbuf_render *render,  +		unsigned prim ) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	unsigned hwp = nvgl_primitive(prim); +	if (hwp == 0) +		return FALSE; + +	nv10_render->hwprim = hwp; +	return TRUE; +} + + +static void  +nv10_vbuf_render_draw( struct vbuf_render *render, +		const ushort *indices, +		uint nr_indices) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	struct nv10_context *nv10 = nv10_render->nv10; +	int push, i; + +	nv10_emit_hw_state(nv10); + +	BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); +	OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + +	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); +	OUT_RING(nv10_render->hwprim); + +	if (nr_indices & 1) { +		BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1); +		OUT_RING  (indices[0]); +		indices++; nr_indices--; +	} + +	while (nr_indices) { +		// XXX too big/small ? 
check the size +		push = MIN2(nr_indices, 1200 * 2); + +		BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1); +		for (i = 0; i < push; i+=2) +			OUT_RING((indices[i+1] << 16) | indices[i]); + +		nr_indices -= push; +		indices  += push; +	} + +	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); +	OUT_RING  (0); +} + + +static void +nv10_vbuf_render_release_vertices( struct vbuf_render *render, +		void *vertices,  +		unsigned vertex_size, +		unsigned vertices_used ) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	struct nv10_context *nv10 = nv10_render->nv10; +	struct pipe_winsys *winsys = nv10->pipe.winsys; +	struct pipe_screen *pscreen = &nv10->screen->pipe; + +	assert(nv10_render->buffer); +	winsys->buffer_unmap(winsys, nv10_render->buffer); +	pipe_buffer_reference(pscreen, &nv10_render->buffer, NULL); +} + + +static void +nv10_vbuf_render_destroy( struct vbuf_render *render ) +{ +	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); +	FREE(nv10_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv10_vbuf_render_create( struct nv10_context *nv10 ) +{ +	struct nv10_vbuf_render *nv10_render = CALLOC_STRUCT(nv10_vbuf_render); + +	nv10_render->nv10 = nv10; + +	nv10_render->base.max_vertex_buffer_bytes = 16*1024; +	nv10_render->base.max_indices = 1024; +	nv10_render->base.get_vertex_info = nv10_vbuf_render_get_vertex_info; +	nv10_render->base.allocate_vertices = nv10_vbuf_render_allocate_vertices; +	nv10_render->base.set_primitive = nv10_vbuf_render_set_primitive; +	nv10_render->base.draw = nv10_vbuf_render_draw; +	nv10_render->base.release_vertices = nv10_vbuf_render_release_vertices; +	nv10_render->base.destroy = nv10_vbuf_render_destroy; + +	return &nv10_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. 
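+ * Wraps the nv10 vbuf renderer above in a draw-module vbuf stage
+ * (draw_vbuf_stage()), so post-transform vertices reach the hardware.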
+ */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ) +{ +	struct vbuf_render *render; +	struct draw_stage *stage; + +	render = nv10_vbuf_render_create(nv10); +	if(!render) +		return NULL; + +	stage = draw_vbuf_stage( nv10->draw, render ); +	if(!stage) { +		render->destroy(render); +		return NULL; +	} + +	return stage; +} diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c new file mode 100644 index 0000000000..f417b06c94 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.c @@ -0,0 +1,226 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static const char * +nv10_screen_get_name(struct pipe_screen *screen) +{ +	struct nv10_screen *nv10screen = nv10_screen(screen); +	struct nouveau_device *dev = nv10screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv10_screen_get_vendor(struct pipe_screen *screen) +{ +	return "nouveau"; +} + +static int +nv10_screen_get_param(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 2; +	case PIPE_CAP_NPOT_TEXTURES: +		return 0; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 0; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 0; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 1; +	case PIPE_CAP_POINT_SPRITE: +		return 0; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 1; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 0; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 12; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 12; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; +	case NOUVEAU_CAP_HW_VTXBUF: +	case NOUVEAU_CAP_HW_IDXBUF: +		return 0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv10_screen_get_paramf(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 10.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 64.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 2.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 4.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static boolean +nv10_screen_is_format_supported(struct pipe_screen *screen, +				enum pipe_format format, +				enum pipe_texture_target target, +				unsigned tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_A4R4G4B4_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +		case PIPE_FORMAT_I8_UNORM: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static void * +nv10_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, +		 unsigned flags ) +{ +	struct pipe_winsys *ws = screen->winsys; +	void *map; +        struct nv10_miptree *nv10mt = (struct nv10_miptree *)surface->texture; + +	map = ws->buffer_map(ws, nv10mt->buffer, 
flags); +	if (!map) +		return NULL; + +	return map + surface->offset; +} + +static void +nv10_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ +	struct pipe_winsys *ws = screen->winsys; +        struct nv10_miptree *nv10mt = (struct nv10_miptree *)surface->texture; + +	ws->buffer_unmap(ws, nv10mt->buffer); +} + +static void +nv10_screen_destroy(struct pipe_screen *pscreen) +{ +	struct nv10_screen *screen = nv10_screen(pscreen); +	struct nouveau_winsys *nvws = screen->nvws; + +	nvws->notifier_free(&screen->sync); +	nvws->grobj_free(&screen->celsius); + +	FREE(pscreen); +} + +static struct pipe_buffer * +nv10_surface_buffer(struct pipe_surface *surf) +{ +	struct nv10_miptree *mt = (struct nv10_miptree *)surf->texture; + +	return mt->buffer; +} + +struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv10_screen *screen = CALLOC_STRUCT(nv10_screen); +	unsigned celsius_class; +	unsigned chipset = nvws->channel->device->chipset; +	int ret; + +	if (!screen) +		return NULL; +	screen->nvws = nvws; + +	/* 2D engine setup */ +	screen->eng2d = nv04_surface_2d_init(nvws); +	screen->eng2d->buf = nv10_surface_buffer; + +	/* 3D object */ +	if (chipset>=0x20) +		celsius_class=NV11TCL; +	else if (chipset>=0x17) +		celsius_class=NV17TCL; +	else if (chipset>=0x11) +		celsius_class=NV11TCL; +	else +		celsius_class=NV10TCL; + +	if (!celsius_class) { +		NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset); +		return NULL; +	} + +	ret = nvws->grobj_alloc(nvws, celsius_class, &screen->celsius); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		return FALSE; +	} + +	/* Notifier for sync purposes */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv10_screen_destroy(&screen->pipe); +		return NULL; +	} + +	screen->pipe.winsys = ws; +	screen->pipe.destroy = nv10_screen_destroy; + +	screen->pipe.get_name = nv10_screen_get_name; +	screen->pipe.get_vendor = nv10_screen_get_vendor; +	screen->pipe.get_param = nv10_screen_get_param; +	screen->pipe.get_paramf = nv10_screen_get_paramf; + +	screen->pipe.is_format_supported = nv10_screen_is_format_supported; + +	screen->pipe.surface_map = nv10_surface_map; +	screen->pipe.surface_unmap = nv10_surface_unmap; + +	nv10_screen_init_miptree_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_screen.h b/src/gallium/drivers/nv10/nv10_screen.h new file mode 100644 index 0000000000..60102a369a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.h @@ -0,0 +1,24 @@ +#ifndef __NV10_SCREEN_H__ +#define __NV10_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv10_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; + +	/* HW graphics objects */ +	struct nv04_surface_2d *eng2d; +	struct nouveau_grobj *celsius; +	struct nouveau_notifier *sync; +}; + +static INLINE struct nv10_screen * +nv10_screen(struct pipe_screen *screen) +{ +	return (struct nv10_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c new file mode 100644 index 0000000000..119af66dfd --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.c @@ -0,0 +1,589 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include 
"nv10_context.h" +#include "nv10_state.h" + +static void * +nv10_blend_state_create(struct pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nv10_blend_state *cb; + +	cb = MALLOC(sizeof(struct nv10_blend_state)); + +	cb->b_enable = cso->blend_enable ? 1 : 0; +	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_src_factor))); +	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_dst_factor))); + +	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | +		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | +		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | +		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + +	cb->d_enable = cso->dither ? 1 : 0; + +	return (void *)cb; +} + +static void +nv10_blend_state_bind(struct pipe_context *pipe, void *blend) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->blend = (struct nv10_blend_state*)blend; + +	nv10->dirty |= NV10_NEW_BLEND; +} + +static void +nv10_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { +	unsigned ret; + +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; +		break; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		ret = NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_CLAMP: +		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; +		break; +	} + +	return ret >> NV10TCL_TX_FORMAT_WRAP_S_SHIFT; +} + +static void * +nv10_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ +	struct nv10_sampler_state *ps; +	uint32_t filter = 0; + +	ps = MALLOC(sizeof(struct nv10_sampler_state)); + +	ps->wrap = ((wrap_mode(cso->wrap_s) << NV10TCL_TX_FORMAT_WRAP_S_SHIFT) | +		    (wrap_mode(cso->wrap_t) << NV10TCL_TX_FORMAT_WRAP_T_SHIFT)); + +	ps->en = 0; +	if (cso->max_anisotropy > 1.0) { +		/* no idea, binary driver sets it, works without it.. meh.. 
*/ +		ps->wrap |= (1 << 5); + +/*		if (cso->max_anisotropy >= 16.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_16X; +		} else +		if (cso->max_anisotropy >= 12.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_12X; +		} else +		if (cso->max_anisotropy >= 10.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_10X; +		} else +		if (cso->max_anisotropy >= 8.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_8X; +		} else +		if (cso->max_anisotropy >= 6.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_6X; +		} else +		if (cso->max_anisotropy >= 4.0) { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_4X; +		} else { +			ps->en |= NV10TCL_TX_ENABLE_ANISO_2X; +		}*/ +	} + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		filter |= NV10TCL_TX_FILTER_MAGNIFY_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		filter |= NV10TCL_TX_FILTER_MAGNIFY_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; +			break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR; +			break; +		} +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; +		break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST; +			break; +		} +		break; +	} + +	ps->filt = filter; + +/*	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +		switch (cso->compare_func) { +		case PIPE_FUNC_NEVER: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; +			break; +		case PIPE_FUNC_GREATER: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; +			break; +		case PIPE_FUNC_EQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; +			break; +		case PIPE_FUNC_GEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; +			break; +		case PIPE_FUNC_LESS: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; +			break; +		case PIPE_FUNC_NOTEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; +			break; +		case PIPE_FUNC_LEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; +			break; +		case PIPE_FUNC_ALWAYS: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; +			break; +		default: +			break; +		} +	}*/ + +	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | +		    (float_to_ubyte(cso->border_color[0]) << 16) | +		    (float_to_ubyte(cso->border_color[1]) <<  8) | +		    (float_to_ubyte(cso->border_color[2]) <<  0)); + +	return (void *)ps; +} + +static void +nv10_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv10->tex_sampler[unit] = sampler[unit]; +		nv10->dirty_samplers |= (1 << unit); +	} +} + +static void +nv10_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void +nv10_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **miptree) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv10->tex_miptree[unit] = (struct nv10_miptree *)miptree[unit]; +		nv10->dirty_samplers |= (1 << unit); +	} +} + +static void * 
+nv10_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nv10_rasterizer_state *rs; +	int i; + +	/*XXX: ignored: +	 * 	light_twoside +	 * 	offset_cw/ccw -nohw +	 * 	scissor +	 * 	point_smooth -nohw +	 * 	multisample +	 * 	offset_units / offset_scale +	 */ +	rs = MALLOC(sizeof(struct nv10_rasterizer_state)); + +	rs->templ = cso; +	 +	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01; + +	rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; +	rs->line_smooth_en = cso->line_smooth ? 1 : 0; + +	rs->point_size = *(uint32_t*)&cso->point_size; + +	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + +	if (cso->front_winding == PIPE_WINDING_CCW) { +		rs->front_face = NV10TCL_FRONT_FACE_CCW; +		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); +		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw); +	} else { +		rs->front_face = NV10TCL_FRONT_FACE_CW; +		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); +		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw); +	} + +	switch (cso->cull_mode) { +	case PIPE_WINDING_CCW: +		rs->cull_face_en = 1; +		if (cso->front_winding == PIPE_WINDING_CCW) +			rs->cull_face    = NV10TCL_CULL_FACE_FRONT; +		else +			rs->cull_face    = NV10TCL_CULL_FACE_BACK; +		break; +	case PIPE_WINDING_CW: +		rs->cull_face_en = 1; +		if (cso->front_winding == PIPE_WINDING_CW) +			rs->cull_face    = NV10TCL_CULL_FACE_FRONT; +		else +			rs->cull_face    = NV10TCL_CULL_FACE_BACK; +		break; +	case PIPE_WINDING_BOTH: +		rs->cull_face_en = 1; +		rs->cull_face    = NV10TCL_CULL_FACE_FRONT_AND_BACK; +		break; +	case PIPE_WINDING_NONE: +	default: +		rs->cull_face_en = 0; +		rs->cull_face    = 0; +		break; +	} + +	if (cso->point_sprite) { +		rs->point_sprite = (1 << 0); +		for (i = 0; i < 8; i++) { +			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) +				rs->point_sprite |= (1 << (8 + i)); +		} +	} else { +		rs->point_sprite = 0; +	} + +	return (void *)rs; +} + +static void +nv10_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->rast = (struct nv10_rasterizer_state*)rast; + +	draw_set_rasterizer_state(nv10->draw, (nv10->rast ? nv10->rast->templ : NULL)); + +	nv10->dirty |= NV10_NEW_RAST; +} + +static void +nv10_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void * +nv10_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nv10_depth_stencil_alpha_state *hw; + +	hw = MALLOC(sizeof(struct nv10_depth_stencil_alpha_state)); + +	hw->depth.func		= nvgl_comparison_op(cso->depth.func); +	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0; +	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0; + +	hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; +	hw->stencil.wmask = cso->stencil[0].writemask; +	hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); +	hw->stencil.ref	= cso->stencil[0].ref_value; +	hw->stencil.vmask = cso->stencil[0].valuemask; +	hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); +	hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); +	hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + +	hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; +	hw->alpha.func = nvgl_comparison_op(cso->alpha.func); +	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref_value); + +	return (void *)hw; +} + +static void +nv10_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->dsa = (struct nv10_depth_stencil_alpha_state*)dsa; + +	nv10->dirty |= NV10_NEW_DSA; +} + +static void +nv10_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void * +nv10_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *templ) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	return draw_create_vertex_shader(nv10->draw, templ); +} + +static void +nv10_vp_state_bind(struct pipe_context *pipe, void *shader) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	draw_bind_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); + +	nv10->dirty |= NV10_NEW_VERTPROG; +} + +static void +nv10_vp_state_delete(struct pipe_context *pipe, void *shader) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	draw_delete_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv10_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv10_fragment_program *fp; + +	fp = CALLOC(1, sizeof(struct nv10_fragment_program)); +	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	 +	tgsi_scan_shader(cso->tokens, &fp->info); + +	return (void *)fp; +} + +static void +nv10_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	struct nv10_fragment_program *fp = hwcso; + +	nv10->fragprog.current = fp; +	nv10->dirty |= NV10_NEW_FRAGPROG; +} + +static void +nv10_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	struct nv10_fragment_program *fp = hwcso; + +	nv10_fragprog_destroy(nv10, fp); +	FREE((void*)fp->pipe.tokens); +	FREE(fp); +} + +static void +nv10_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->blend_color = (struct pipe_blend_color*)bcol; + +	nv10->dirty |= NV10_NEW_BLENDCOL; +} + +static void +nv10_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	draw_set_clip_state(nv10->draw, clip); +} + +static void +nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	struct pipe_winsys *ws = pipe->winsys; + +	assert(shader < PIPE_SHADER_TYPES); +	assert(index == 0); + +	if (buf) { +		void *mapped; +		if (buf->buffer && buf->buffer->size && +                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) +		{ +			memcpy(nv10->constbuf[shader], mapped, buf->buffer->size); +			nv10->constbuf_nr[shader] = +				buf->buffer->size / (4 * sizeof(float)); +			ws->buffer_unmap(ws, buf->buffer); +		} +	} +} + +static void +nv10_set_framebuffer_state(struct pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->framebuffer = (struct pipe_framebuffer_state*)fb; + +	nv10->dirty |= NV10_NEW_FRAMEBUFFER; +} + +static void +nv10_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	
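+	/* Stub: polygon stipple is not implemented yet, just complain. */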
NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv10_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->scissor = (struct pipe_scissor_state*)s; + +	nv10->dirty |= NV10_NEW_SCISSOR; +} + +static void +nv10_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *vpt) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	nv10->viewport = (struct pipe_viewport_state*)vpt; + +	draw_set_viewport_state(nv10->draw, nv10->viewport); + +	nv10->dirty |= NV10_NEW_VIEWPORT; +} + +static void +nv10_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_buffer *vb) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	memcpy(nv10->vtxbuf, vb, sizeof(*vb) * count); +	nv10->dirty |= NV10_NEW_VTXARRAYS; + +	draw_set_vertex_buffers(nv10->draw, count, vb); +} + +static void +nv10_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			 const struct pipe_vertex_element *ve) +{ +	struct nv10_context *nv10 = nv10_context(pipe); + +	memcpy(nv10->vtxelt, ve, sizeof(*ve) * count); +	nv10->dirty |= NV10_NEW_VTXARRAYS; + +	draw_set_vertex_elements(nv10->draw, count, ve); +} + +void +nv10_init_state_functions(struct nv10_context *nv10) +{ +	nv10->pipe.create_blend_state = nv10_blend_state_create; +	nv10->pipe.bind_blend_state = nv10_blend_state_bind; +	nv10->pipe.delete_blend_state = nv10_blend_state_delete; + +	nv10->pipe.create_sampler_state = nv10_sampler_state_create; +	nv10->pipe.bind_sampler_states = nv10_sampler_state_bind; +	nv10->pipe.delete_sampler_state = nv10_sampler_state_delete; +	nv10->pipe.set_sampler_textures = nv10_set_sampler_texture; + +	nv10->pipe.create_rasterizer_state = nv10_rasterizer_state_create; +	nv10->pipe.bind_rasterizer_state = nv10_rasterizer_state_bind; +	nv10->pipe.delete_rasterizer_state = nv10_rasterizer_state_delete; + +	nv10->pipe.create_depth_stencil_alpha_state = +		nv10_depth_stencil_alpha_state_create; +	nv10->pipe.bind_depth_stencil_alpha_state = +		nv10_depth_stencil_alpha_state_bind; +	nv10->pipe.delete_depth_stencil_alpha_state = +		nv10_depth_stencil_alpha_state_delete; + +	nv10->pipe.create_vs_state = nv10_vp_state_create; +	nv10->pipe.bind_vs_state = nv10_vp_state_bind; +	nv10->pipe.delete_vs_state = nv10_vp_state_delete; + +	nv10->pipe.create_fs_state = nv10_fp_state_create; +	nv10->pipe.bind_fs_state = nv10_fp_state_bind; +	nv10->pipe.delete_fs_state = nv10_fp_state_delete; + +	nv10->pipe.set_blend_color = nv10_set_blend_color; +	nv10->pipe.set_clip_state = nv10_set_clip_state; +	nv10->pipe.set_constant_buffer = nv10_set_constant_buffer; +	nv10->pipe.set_framebuffer_state = nv10_set_framebuffer_state; +	nv10->pipe.set_polygon_stipple = nv10_set_polygon_stipple; +	nv10->pipe.set_scissor_state = nv10_set_scissor_state; +	nv10->pipe.set_viewport_state = nv10_set_viewport_state; + +	nv10->pipe.set_vertex_buffers = nv10_set_vertex_buffers; +	nv10->pipe.set_vertex_elements = nv10_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv10/nv10_state.h b/src/gallium/drivers/nv10/nv10_state.h new file mode 100644 index 0000000000..3a3fd0d4f4 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.h @@ -0,0 +1,139 @@ +#ifndef __NV10_STATE_H__ +#define __NV10_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv10_blend_state { +	uint32_t b_enable; +	uint32_t b_srcfunc; +	uint32_t b_dstfunc; + +	uint32_t c_mask; + +	uint32_t d_enable; +}; + +struct 
nv10_sampler_state { +	uint32_t wrap; +	uint32_t en; +	uint32_t filt; +	uint32_t bcol; +}; + +struct nv10_rasterizer_state { +	uint32_t shade_model; + +	uint32_t line_width; +	uint32_t line_smooth_en; + +	uint32_t point_size; + +	uint32_t poly_smooth_en; +	 +	uint32_t poly_mode_front; +	uint32_t poly_mode_back; + +	uint32_t front_face; +	uint32_t cull_face; +	uint32_t cull_face_en; + +	uint32_t point_sprite; + +	const struct pipe_rasterizer_state *templ; +}; + +struct nv10_vertex_program_exec { +	uint32_t data[4]; +	boolean has_branch_offset; +	int const_index; +}; + +struct nv10_vertex_program_data { +	int index; /* immediates == -1 */ +	float value[4]; +}; + +struct nv10_vertex_program { +	const struct pipe_shader_state *pipe; + +	boolean translated; +	struct nv10_vertex_program_exec *insns; +	unsigned nr_insns; +	struct nv10_vertex_program_data *consts; +	unsigned nr_consts; + +	struct nouveau_resource *exec; +	unsigned exec_start; +	struct nouveau_resource *data; +	unsigned data_start; +	unsigned data_start_min; + +	uint32_t ir; +	uint32_t or; +}; + +struct nv10_fragment_program_data { +	unsigned offset; +	unsigned index; +}; + +struct nv10_fragment_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; + +	boolean translated; +	boolean on_hw; +	unsigned samplers; + +	uint32_t *insn; +	int       insn_len; + +	struct nv10_fragment_program_data *consts; +	unsigned nr_consts; + +	struct pipe_buffer *buffer; + +	uint32_t fp_control; +	uint32_t fp_reg_control; +}; + + +struct nv10_depth_stencil_alpha_state { +	struct { +		uint32_t func; +		uint32_t write_enable; +		uint32_t test_enable; +	} depth; + +	struct { +		uint32_t enable; +		uint32_t wmask; +		uint32_t func; +		uint32_t ref; +		uint32_t vmask; +		uint32_t fail; +		uint32_t zfail; +		uint32_t zpass; +	} stencil; + +	struct { +		uint32_t enabled; +		uint32_t func; +		uint32_t ref; +	} alpha; +}; + +struct nv10_miptree { +	struct pipe_texture base; + +	struct pipe_buffer *buffer; +	uint total_size; + +	struct { +		uint pitch; +		uint *image_offset; +	} level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c new file mode 100644 index 0000000000..5dec618b93 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state_emit.c @@ -0,0 +1,306 @@ +#include "nv10_context.h" +#include "nv10_state.h" + +static void nv10_state_emit_blend(struct nv10_context* nv10) +{ +	struct nv10_blend_state *b = nv10->blend; + +	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1); +	OUT_RING  (b->d_enable); + +	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3); +	OUT_RING  (b->b_enable); +	OUT_RING  (b->b_srcfunc); +	OUT_RING  (b->b_dstfunc); + +	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); +	OUT_RING  (b->c_mask); +} + +static void nv10_state_emit_blend_color(struct nv10_context* nv10) +{ +	struct pipe_blend_color *c = nv10->blend_color; + +	BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1); +	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)| +		   (float_to_ubyte(c->color[0]) << 16)| +		   (float_to_ubyte(c->color[1]) << 8) | +		   (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv10_state_emit_rast(struct nv10_context* nv10) +{ +	struct nv10_rasterizer_state *r = nv10->rast; + +	BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2); +	OUT_RING  (r->shade_model); +	OUT_RING  (r->line_width); + + +	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); +	OUT_RING  (r->point_size); + +	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); +	OUT_RING  (r->poly_mode_front); +	OUT_RING  
(r->poly_mode_back); + + +	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2); +	OUT_RING  (r->cull_face); +	OUT_RING  (r->front_face); + +	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2); +	OUT_RING  (r->line_smooth_en); +	OUT_RING  (r->poly_smooth_en); + +	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); +	OUT_RING  (r->cull_face_en); +} + +static void nv10_state_emit_dsa(struct nv10_context* nv10) +{ +	struct nv10_depth_stencil_alpha_state *d = nv10->dsa; + +	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); +	OUT_RING (d->depth.func); + +	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); +	OUT_RING (d->depth.write_enable); + +	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); +	OUT_RING (d->depth.test_enable); + +#if 0 +	BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1); +	OUT_RING (d->stencil.enable); +	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7); +	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + +	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); +	OUT_RING (d->alpha.enabled); + +	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1); +	OUT_RING (d->alpha.func); + +	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1); +	OUT_RING (d->alpha.ref); +} + +static void nv10_state_emit_viewport(struct nv10_context* nv10) +{ +} + +static void nv10_state_emit_scissor(struct nv10_context* nv10) +{ +	// XXX this is so not working +/*	struct pipe_scissor_state *s = nv10->scissor; +	BEGIN_RING(celsius, NV10TCL_SCISSOR_HORIZ, 2); +	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx); +	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv10_state_emit_framebuffer(struct nv10_context* nv10) +{ +	struct pipe_framebuffer_state* fb = nv10->framebuffer; +	struct pipe_surface *rt, *zeta = NULL; +	uint32_t rt_format, w, h; +	int colour_format = 0, zeta_format = 0; +        struct nv10_miptree *nv10mt = 0; + +	w = fb->cbufs[0]->width; +	h = fb->cbufs[0]->height; +	colour_format = fb->cbufs[0]->format; +	rt = fb->cbufs[0]; + +	if (fb->zsbuf) { +		if (colour_format) { +			assert(w == fb->zsbuf->width); +			assert(h == fb->zsbuf->height); +		} else { +			w = fb->zsbuf->width; +			h = fb->zsbuf->height; +		} + +		zeta_format = fb->zsbuf->format; +		zeta = fb->zsbuf; +	} + +	rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR; + +	switch (colour_format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case 0: +		rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8; +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		rt_format |= NV10TCL_RT_FORMAT_COLOR_R5G6B5; +		break; +	default: +		assert(0); +	} + +	if (zeta) { +		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); +		OUT_RING  (rt->stride | (zeta->stride << 16)); +	} else { +		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); +		OUT_RING  (rt->stride | (rt->stride << 16)); +	} + +	nv10mt = (struct nv10_miptree *)rt->texture; +	nv10->rt[0] = nv10mt->buffer; + +	if (zeta_format) +	{ +		nv10mt = (struct nv10_miptree *)zeta->texture; +		nv10->zeta = nv10mt->buffer; +	} + +	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3); +	OUT_RING  ((w << 16) | 0); +	OUT_RING  ((h << 16) | 0); +	OUT_RING  (rt_format); +	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2); +	OUT_RING  (((w - 1) << 16) | 0 | 0x08000800); +	OUT_RING  (((h - 1) << 16) | 0 | 0x08000800); +} + +static void nv10_vertex_layout(struct nv10_context *nv10) +{ +	struct nv10_fragment_program *fp = nv10->fragprog.current; +	uint32_t src = 0; +	int i; +	struct vertex_info vinfo; + +	memset(&vinfo, 0, sizeof(vinfo)); + +	for (i = 0; i < fp->info.num_inputs; i++) { +		switch (fp->info.input_semantic_name[i]) { +			case TGSI_SEMANTIC_POSITION: +				
draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); +				break; +			case TGSI_SEMANTIC_COLOR: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); +				break; +			default: +			case TGSI_SEMANTIC_GENERIC: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); +				break; +			case TGSI_SEMANTIC_FOG: +				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); +				break; +		} +	} +	draw_compute_vertex_size(&vinfo); +} + +void +nv10_emit_hw_state(struct nv10_context *nv10) +{ +	int i; + +	if (nv10->dirty & NV10_NEW_VERTPROG) { +		//nv10_vertprog_bind(nv10, nv10->vertprog.current); +		nv10->dirty &= ~NV10_NEW_VERTPROG; +	} + +	if (nv10->dirty & NV10_NEW_FRAGPROG) { +		nv10_fragprog_bind(nv10, nv10->fragprog.current); +		/*XXX: clear NV10_NEW_FRAGPROG if no new program uploaded */ +		nv10->dirty_samplers |= (1<<10); +		nv10->dirty_samplers = 0; +	} + +	if (nv10->dirty_samplers || (nv10->dirty & NV10_NEW_FRAGPROG)) { +		nv10_fragtex_bind(nv10); +		nv10->dirty &= ~NV10_NEW_FRAGPROG; +	} + +	if (nv10->dirty & NV10_NEW_VTXARRAYS) { +		nv10->dirty &= ~NV10_NEW_VTXARRAYS; +		nv10_vertex_layout(nv10); +		nv10_vtxbuf_bind(nv10); +	} + +	if (nv10->dirty & NV10_NEW_BLEND) { +		nv10->dirty &= ~NV10_NEW_BLEND; +		nv10_state_emit_blend(nv10); +	} + +	if (nv10->dirty & NV10_NEW_BLENDCOL) { +		nv10->dirty &= ~NV10_NEW_BLENDCOL; +		nv10_state_emit_blend_color(nv10); +	} + +	if (nv10->dirty & NV10_NEW_RAST) { +		nv10->dirty &= ~NV10_NEW_RAST; +		nv10_state_emit_rast(nv10); +	} + +	if (nv10->dirty & NV10_NEW_DSA) { +		nv10->dirty &= ~NV10_NEW_DSA; +		nv10_state_emit_dsa(nv10); +	} + + 	if (nv10->dirty & NV10_NEW_VIEWPORT) { +		nv10->dirty &= ~NV10_NEW_VIEWPORT; +		nv10_state_emit_viewport(nv10); +	} + + 	if (nv10->dirty & NV10_NEW_SCISSOR) { +		nv10->dirty &= ~NV10_NEW_SCISSOR; +		nv10_state_emit_scissor(nv10); +	} + + 	if (nv10->dirty & NV10_NEW_FRAMEBUFFER) { +		nv10->dirty &= ~NV10_NEW_FRAMEBUFFER; +		nv10_state_emit_framebuffer(nv10); +	} + +	/* Emit relocs for every referenced buffer. +	 * This is to ensure the bufmgr has an accurate idea of how +	 * the buffer is used.  This isn't very efficient, but we don't +	 * seem to take a significant performance hit.  Will be improved +	 * at some point.  
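+	 * (Each OUT_RELOC* call below records a relocation, so the buffer's
+	 * final GPU offset can be patched into the push buffer when the
+	 * buffers are validated.)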
Vertex arrays are emitted by nv10_vbo.c +	 */ + +	/* Render target */ +// XXX figre out who's who for NV10TCL_DMA_* and fill accordingly +//	BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1); +//	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); +	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	if (nv10->zeta) { +// XXX +//		BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1); +//		OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +		BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1); +		OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +		/* XXX for when we allocate LMA on nv17 */ +/*		BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); +		OUT_RELOCl(nv10->zeta + lma_offset);*/ +	} + +	/* Vertex buffer */ +	BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1); +	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); +	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	/* Texture images */ +	for (i = 0; i < 2; i++) { +		if (!(nv10->fp_samplers & (1 << i))) +			continue; +		BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1); +		OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM | +			   NOUVEAU_BO_GART | NOUVEAU_BO_RD); +		BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1); +		OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format, +			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | +			   NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0, +			   NV10TCL_TX_FORMAT_DMA1); +	} +} + diff --git a/src/gallium/drivers/nv10/nv10_surface.c b/src/gallium/drivers/nv10/nv10_surface.c new file mode 100644 index 0000000000..2538151063 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#include "nv10_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv10_surface_copy(struct pipe_context *pipe, boolean do_flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	struct nv04_surface_2d *eng2d = nv10->screen->eng2d; + +	if (do_flip) { +		desty += height; +		while (height--) { +			eng2d->copy(eng2d, dest, destx, desty--, src, +				    srcx, srcy++, width, 1); +		} +		return; +	} + +	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv10_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv10_context *nv10 = nv10_context(pipe); +	struct nv04_surface_2d *eng2d = nv10->screen->eng2d; + +	eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv10_init_surface_functions(struct nv10_context *nv10) +{ +	nv10->pipe.surface_copy = nv10_surface_copy; +	nv10->pipe.surface_fill = nv10_surface_fill; +} diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c new file mode 100644 index 0000000000..d0e788ac03 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_vbo.c @@ -0,0 +1,77 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv10_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count) +{ +	struct nv10_context *nv10 = nv10_context( pipe ); +	struct draw_context *draw = nv10->draw; +	unsigned i; + +	nv10_emit_hw_state(nv10); + +	/* +	 * Map vertex buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv10->vtxbuf[i].buffer) { +			void *buf +				= pipe->winsys->buffer_map(pipe->winsys, +						nv10->vtxbuf[i].buffer, +						PIPE_BUFFER_USAGE_CPU_READ); +			draw_set_mapped_vertex_buffer(draw, i, buf); +		} +	} +	/* Map index buffer, if present */ +	if (indexBuffer) { +		void *mapped_indexes +			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer, +					PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); +	} +	else { +		/* no index/element buffer */ +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	draw_set_mapped_constant_buffer(draw, +					nv10->constbuf[PIPE_SHADER_VERTEX], +					nv10->constbuf_nr[PIPE_SHADER_VERTEX]); + +	/* draw! 
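+	 * (Everything goes through the software draw module; the vbuf stage
+	 * installed in nv10_create() turns the results into hardware commands.)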
*/ +	draw_arrays(nv10->draw, prim, start, count); + +	/* +	 * unmap vertex/index buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv10->vtxbuf[i].buffer) { +			pipe->winsys->buffer_unmap(pipe->winsys, nv10->vtxbuf[i].buffer); +			draw_set_mapped_vertex_buffer(draw, i, NULL); +		} +	} +	if (indexBuffer) { +		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	return TRUE; +} + +boolean nv10_draw_arrays( struct pipe_context *pipe, +				 unsigned prim, unsigned start, unsigned count) +{ +	return nv10_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/Makefile b/src/gallium/drivers/nv20/Makefile new file mode 100644 index 0000000000..d777fd3d8b --- /dev/null +++ b/src/gallium/drivers/nv20/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv20 + +DRIVER_SOURCES = \ +	nv20_clear.c \ +	nv20_context.c \ +	nv20_fragprog.c \ +	nv20_fragtex.c \ +	nv20_miptree.c \ +	nv20_prim_vbuf.c \ +	nv20_screen.c \ +	nv20_state.c \ +	nv20_state_emit.c \ +	nv20_surface.c \ +	nv20_vbo.c +#	nv20_vertprog.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv20/nv20_clear.c b/src/gallium/drivers/nv20/nv20_clear.c new file mode 100644 index 0000000000..29f4afd87c --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" + +void +nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +	ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c new file mode 100644 index 0000000000..1659aec8fa --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_context.c @@ -0,0 +1,419 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	draw_flush(nv20->draw); + +	FIRE_RING(fence); +} + +static void +nv20_destroy(struct pipe_context *pipe) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	if (nv20->draw) +		draw_destroy(nv20->draw); + +	FREE(nv20); +} + +static void nv20_init_hwctx(struct nv20_context *nv20) +{ +	struct nv20_screen *screen = nv20->screen; +	struct nouveau_winsys *nvws = screen->nvws; +	int i; +	float projectionmatrix[16]; +	const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL); + +	BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1); +	OUT_RING  (screen->sync->handle); +	BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2); +	OUT_RING  (nvws->channel->vram->handle); +	OUT_RING  (nvws->channel->gart->handle); /* TEXTURE1 */ +	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2); +	OUT_RING  (nvws->channel->vram->handle); +	OUT_RING  (nvws->channel->vram->handle); /* ZETA */ + +	BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1); +	OUT_RING  (0); /* renouveau: beef0351, unique */ + +	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); +	OUT_RING  (0); +	OUT_RING  (0); + +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1); +	OUT_RING  ((0xfff << 16) | 0x0); +	BEGIN_RING(kelvin, 
NV20TCL_VIEWPORT_CLIP_VERT(0), 1); +	OUT_RING  ((0xfff << 16) | 0x0); + +	for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) { +		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1); +		OUT_RING  (0); +		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1); +		OUT_RING  (0); +	} + +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1); +	OUT_RING  (0); + +	BEGIN_RING(kelvin, 0x17e0, 3); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0); +	OUT_RINGf (1.0); + +	if (is_nv25tcl) { +		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); +		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0); +	} else { +		BEGIN_RING(kelvin, 0x1e68, 1); +		OUT_RING  (0x4b800000); /* 16777216.000000 */ +		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); +		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL); +	} + +	BEGIN_RING(kelvin, 0x290, 1); +	OUT_RING  ((0x10 << 16) | 1); +	BEGIN_RING(kelvin, 0x9fc, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, 0x1d80, 1); +	OUT_RING  (1); +	BEGIN_RING(kelvin, 0x9f8, 1); +	OUT_RING  (4); +	BEGIN_RING(kelvin, 0x17ec, 3); +	OUT_RINGf (0.0); +	OUT_RINGf (1.0); +	OUT_RINGf (0.0); + +	if (is_nv25tcl) { +		BEGIN_RING(kelvin, 0x1d88, 1); +		OUT_RING  (3); + +		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1); +		OUT_RING  (nvws->channel->vram->handle); +		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1); +		OUT_RING  (nvws->channel->vram->handle); +	} +	BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1); +	OUT_RING  (0);	/* renouveau: beef1e10 */ + +	BEGIN_RING(kelvin, 0x1e98, 1); +	OUT_RING  (0); +#if 0 +	if (is_nv25tcl) { +		BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2); +		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */ +		OUT_RING  (NvDmaFB);	/* renouveau: beef0201 */ + +		BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1); +		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */ +	} +#endif +	BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1); +	OUT_RING  (0); + +	BEGIN_RING(kelvin, 0x120, 3); +	OUT_RING  (0); +	OUT_RING  (1); +	OUT_RING  (2); + +/* error: ILLEGAL_MTHD, PROTECTION_FAULT +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4); +	OUT_RINGf (0.0); +	OUT_RINGf (512.0); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0); +*/ + +	if (is_nv25tcl) { +		BEGIN_RING(kelvin, 0x022c, 2); +		OUT_RING  (0x280); +		OUT_RING  (0x07d28000); +	} + +/* * illegal method, protection fault +	BEGIN_RING(NvSub3D, 0x1c2c, 1); +	OUT_RING  (0); */ + +	if (is_nv25tcl) { +		BEGIN_RING(kelvin, 0x1da4, 1); +		OUT_RING  (0); +	} + +/* * crashes with illegal method, protection fault +	BEGIN_RING(NvSub3D, 0x1c18, 1); +	OUT_RING  (0x200); */ + +	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); +	OUT_RING  ((0 << 16) | 0); +	OUT_RING  ((0 << 16) | 0); + +	/* *** Set state *** */ + +	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2); +	OUT_RING  (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS); +	OUT_RING  (0);			/* NV20TCL_ALPHA_FUNC_REF */ + +	for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) { +		BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1); +		OUT_RING  (0); +	} +	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4); +	OUT_RING  (0x30d410d0); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4); +	OUT_RING  (0x00000c00); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1); +	OUT_RING  (0x00011101); +	BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2); +	OUT_RING  (0x130e0300); +	OUT_RING  (0x0c091c80); +	BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4); +	OUT_RING  
(0x00000c00); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4); +	OUT_RING  (0x20c400c0); +	OUT_RING  (0); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_COLOR0, 2); +	OUT_RING  (0); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4); +	OUT_RING  (0x035125a0); +	OUT_RING  (0); +	OUT_RING  (0x40002000); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1); +	OUT_RING  (0xffff0000); + +	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4); +	OUT_RING  (NV20TCL_BLEND_FUNC_SRC_ONE); +	OUT_RING  (NV20TCL_BLEND_FUNC_DST_ZERO); +	OUT_RING  (0);			/* NV20TCL_BLEND_COLOR */ +	OUT_RING  (NV20TCL_BLEND_EQUATION_FUNC_ADD); +	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); +	OUT_RING  (0xff); +	OUT_RING  (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS); +	OUT_RING  (0);			/* NV20TCL_STENCIL_FUNC_REF */ +	OUT_RING  (0xff);		/* NV20TCL_STENCIL_FUNC_MASK */ +	OUT_RING  (NV20TCL_STENCIL_OP_FAIL_KEEP); +	OUT_RING  (NV20TCL_STENCIL_OP_ZFAIL_KEEP); +	OUT_RING  (NV20TCL_STENCIL_OP_ZPASS_KEEP); + +	BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2); +	OUT_RING  (0); +	OUT_RING  (NV20TCL_COLOR_LOGIC_OP_OP_COPY); +	BEGIN_RING(kelvin, 0x17cc, 1); +	OUT_RING  (0); +	if (is_nv25tcl) { +		BEGIN_RING(kelvin, 0x1d84, 1); +		OUT_RING  (1); +	} +	BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1); +	OUT_RING  (0x00020000); +	BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0), +					NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE); +	for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) { +		OUT_RING(0xffffffff); +	} + +	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3); +	OUT_RING  (0); +	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */ +	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */ +	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); +	OUT_RING  (NV20TCL_DEPTH_FUNC_LESS); +	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2); +	OUT_RINGf (0.0); +	OUT_RINGf (0.0);	/* NV20TCL.POLYGON_OFFSET_UNITS */ +	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1); +	OUT_RING  (1); +	if (!is_nv25tcl) { +		BEGIN_RING(kelvin, 0x1d84, 1); +		OUT_RING  (3); +	} +	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); +	if (!is_nv25tcl) { +		OUT_RING  (8); +	} else { +		OUT_RINGf (1.0); +	} +	if (!is_nv25tcl) { +		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2); +		OUT_RING  (0); +		OUT_RING  (0);		/* NV20TCL.POINT_SMOOTH_ENABLE */ +	} else { +		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1); +		OUT_RING  (0); +		BEGIN_RING(kelvin, 0x0a1c, 1); +		OUT_RING  (0x800); +	} +	BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1); +	OUT_RING  (8); +	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2); +	OUT_RING  (NV20TCL_POLYGON_MODE_FRONT_FILL); +	OUT_RING  (NV20TCL_POLYGON_MODE_BACK_FILL); +	BEGIN_RING(kelvin, 
NV20TCL_CULL_FACE, 2); +	OUT_RING  (NV20TCL_CULL_FACE_BACK); +	OUT_RING  (NV20TCL_FRONT_FACE_CCW); +	BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1); +	OUT_RING  (NV20TCL_SHADE_MODEL_SMOOTH); +	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1); +	OUT_RING  (0); +	BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE); +	for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) { +		OUT_RING(0); +	} +	BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3); +	OUT_RINGf (1.5); +	OUT_RINGf (-0.090168);		/* NV20TCL.FOG_EQUATION_LINEAR */ +	OUT_RINGf (0.0);		/* NV20TCL.FOG_EQUATION_QUADRATIC */ +	BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2); +	OUT_RING  (NV20TCL_FOG_MODE_EXP_2); +	OUT_RING  (NV20TCL_FOG_COORD_DIST_COORD_FOG); +	BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2); +	OUT_RING  (0); +	OUT_RING  (0);			/* NV20TCL.FOG_COLOR */ +	BEGIN_RING(kelvin, NV20TCL_ENGINE, 1); +	OUT_RING  (NV20TCL_ENGINE_FIXED); + +	for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) { +		BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1); +		OUT_RING  (0); +	} + +	BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15); +	OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); +	OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0); +	OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); +	for (i = 4; i < 16; ++i) { +		OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0);	OUT_RINGf(1.0); +	} + +	BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1); +	OUT_RING  (1); +	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); +	OUT_RING (0x00010101); +	BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1); +	OUT_RING (0); + +	memset(projectionmatrix, 0, sizeof(projectionmatrix)); +	projectionmatrix[0*4+0] = 1.0; +	projectionmatrix[1*4+1] = 1.0; +	projectionmatrix[2*4+2] = 16777215.0; +	projectionmatrix[3*4+3] = 1.0; +	BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16); +	for (i = 0; i < 16; i++) { +		OUT_RINGf  (projectionmatrix[i]); +	} + +	BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2); +	OUT_RINGf (0.0); +	OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */ + +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE0_X, 4); +	OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */ +	OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */ +	OUT_RINGf (0.0); +	OUT_RINGf (16777215.0); + +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE1_X, 4); +	OUT_RINGf (0.0); /* no effect?, w/2 */ +	OUT_RINGf (0.0); /* no effect?, h/2 */ +	OUT_RINGf (16777215.0 * 0.5); +	OUT_RINGf (65535.0); + +	FIRE_RING (NULL); +} + +static void +nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv20_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct nv20_screen *screen = nv20_screen(pscreen); +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv20_context *nv20; +	struct nouveau_winsys *nvws = screen->nvws; + +	nv20 = CALLOC(1, sizeof(struct nv20_context)); +	if (!nv20) +		return NULL; +	nv20->screen = screen; +	nv20->pctx_id = pctx_id; + +	nv20->nvws = nvws; + +	nv20->pipe.winsys = ws; +	nv20->pipe.screen = pscreen; +	nv20->pipe.destroy = nv20_destroy; +	nv20->pipe.set_edgeflags = nv20_set_edgeflags; +	nv20->pipe.draw_arrays = nv20_draw_arrays; +	nv20->pipe.draw_elements = nv20_draw_elements; +	nv20->pipe.clear = nv20_clear; +	nv20->pipe.flush = nv20_flush; + +	nv20_init_surface_functions(nv20); +	nv20_init_state_functions(nv20); + +	nv20->draw = draw_create(); +	
assert(nv20->draw); +	draw_set_rasterize_stage(nv20->draw, nv20_draw_vbuf_stage(nv20)); + +	nv20_init_hwctx(nv20); + +	return &nv20->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h new file mode 100644 index 0000000000..8ad926db20 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_context.h @@ -0,0 +1,153 @@ +#ifndef __NV20_CONTEXT_H__ +#define __NV20_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \ +	struct nv20_screen *ctx = nv20->screen +#include "nouveau/nouveau_push.h" + +#include "nv20_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ +	fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV20_NEW_VERTPROG	(1 << 0) +#define NV20_NEW_FRAGPROG	(1 << 1) +#define NV20_NEW_VTXARRAYS	(1 << 2) +#define NV20_NEW_BLEND		(1 << 3) +#define NV20_NEW_BLENDCOL	(1 << 4) +#define NV20_NEW_RAST 		(1 << 5) +#define NV20_NEW_DSA  		(1 << 6) +#define NV20_NEW_VIEWPORT	(1 << 7) +#define NV20_NEW_SCISSOR	(1 << 8) +#define NV20_NEW_FRAMEBUFFER	(1 << 9) + +#include "nv20_screen.h" + +struct nv20_context { +	struct pipe_context pipe; + +	struct nouveau_winsys *nvws; +	struct nv20_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	uint32_t dirty; + +	struct nv20_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; +	struct nv20_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; +	unsigned dirty_samplers; +	unsigned fp_samplers; +	unsigned vp_samplers; + +	uint32_t rt_enable; +	struct pipe_buffer *rt[4]; +	struct pipe_buffer *zeta; +	uint32_t lma_offset; + +	struct nv20_blend_state *blend; +	struct pipe_blend_color *blend_color; +	struct nv20_rasterizer_state *rast; +	struct nv20_depth_stencil_alpha_state *dsa; +	struct pipe_viewport_state *viewport; +	struct pipe_scissor_state *scissor; +	struct pipe_framebuffer_state *framebuffer; + +	//struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; +	float *constbuf[PIPE_SHADER_TYPES][32][4]; +	unsigned constbuf_nr[PIPE_SHADER_TYPES]; + +	struct vertex_info vertex_info; + +	struct { +		struct pipe_buffer *buffer; +		uint32_t format; +	} tex[2]; + +	unsigned vb_enable; +	struct { +		struct pipe_buffer *buffer; +		unsigned delta; +	} vb[16]; + +/*	struct { +	 +		struct nouveau_resource *exec_heap; +		struct nouveau_resource *data_heap; + +		struct nv20_vertex_program *active; + +		struct nv20_vertex_program *current; +	} vertprog; +*/ +	struct { +		struct nv20_fragment_program *active; + +		struct nv20_fragment_program *current; +		struct pipe_buffer *constant_buf; +	} fragprog; + +	struct pipe_vertex_buffer  vtxbuf[PIPE_MAX_ATTRIBS]; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv20_context * +nv20_context(struct pipe_context *pipe) +{ +	return (struct nv20_context *)pipe; +} + +extern void nv20_init_state_functions(struct nv20_context *nv20); +extern void nv20_init_surface_functions(struct nv20_context *nv20); + +extern void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv20_clear.c */ +extern void nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +/* nv20_draw.c */ +extern struct 
draw_stage *nv20_draw_render_stage(struct nv20_context *nv20); + +/* nv20_fragprog.c */ +extern void nv20_fragprog_bind(struct nv20_context *, +			       struct nv20_fragment_program *); +extern void nv20_fragprog_destroy(struct nv20_context *, +				  struct nv20_fragment_program *); + +/* nv20_fragtex.c */ +extern void nv20_fragtex_bind(struct nv20_context *); + +/* nv20_prim_vbuf.c */ +struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ); +extern void nv20_vtxbuf_bind(struct nv20_context* nv20); + +/* nv20_state.c and friends */ +extern void nv20_emit_hw_state(struct nv20_context *nv20); +extern void nv20_state_tex_update(struct nv20_context *nv20); + +/* nv20_vbo.c */ +extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv20_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv20/nv20_fragprog.c b/src/gallium/drivers/nv20/nv20_fragprog.c new file mode 100644 index 0000000000..4f496369dd --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv20_context.h" + +void +nv20_fragprog_bind(struct nv20_context *nv20, struct nv20_fragment_program *fp) +{ +} + +void +nv20_fragprog_destroy(struct nv20_context *nv20, +		      struct nv20_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c new file mode 100644 index 0000000000..495a7be912 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragtex.c @@ -0,0 +1,124 @@ +#include "nv20_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf)                                                                \ +{                                                                              \ +  TRUE,                                                                        \ +  PIPE_FORMAT_##m,                                                             \ +  NV20TCL_TX_FORMAT_FORMAT_##tf,                                               \ +} + +struct nv20_texture_format { +	boolean defined; +	uint	pipe; +	int     format; +}; + +static struct nv20_texture_format +nv20_texture_formats[] = { +	_(A8R8G8B8_UNORM, A8R8G8B8), +	_(A1R5G5B5_UNORM, A1R5G5B5), +	_(A4R4G4B4_UNORM, A4R4G4B4), +	_(L8_UNORM      , L8      ), +	_(A8_UNORM      , A8      ), +	_(A8L8_UNORM    , A8L8    ), +/*	_(RGB_DXT1      , DXT1,   ), */ +/*	_(RGBA_DXT1     , DXT1,   ), */ +/*	_(RGBA_DXT3     , DXT3,   ), */ +/*	_(RGBA_DXT5     , DXT5,   ), */ +	{}, +}; + +static struct nv20_texture_format * +nv20_fragtex_format(uint pipe_format) +{ +	struct nv20_texture_format *tf = nv20_texture_formats; + +	while (tf->defined) { +		if (tf->pipe == pipe_format) +			return tf; +		tf++; +	} + +	return NULL; +} + + +static void +nv20_fragtex_build(struct nv20_context *nv20, int unit) +{ +#if 0 +	struct nv20_sampler_state *ps = nv20->tex_sampler[unit]; +	struct nv20_miptree *nv20mt = nv20->tex_miptree[unit]; +	struct pipe_texture *pt = &nv20mt->base; +	struct nv20_texture_format *tf; +	uint32_t txf, txs, txp; + +	tf = nv20_fragtex_format(pt->format); +	if (!tf || !tf->defined) { +		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", 
pt->format); +		return; +	} + +	txf  = tf->format << 8; +	txf |= (pt->last_level + 1) << 16; +	txf |= log2i(pt->width[0]) << 20; +	txf |= log2i(pt->height[0]) << 24; +	txf |= log2i(pt->depth[0]) << 28; +	txf |= 8; + +	switch (pt->target) { +	case PIPE_TEXTURE_CUBE: +		txf |= NV10TCL_TX_FORMAT_CUBE_MAP; +		/* fall-through */ +	case PIPE_TEXTURE_2D: +		txf |= (2<<4); +		break; +	case PIPE_TEXTURE_1D: +		txf |= (1<<4); +		break; +	default: +		NOUVEAU_ERR("Unknown target %d\n", pt->target); +		return; +	} + +	BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8); +	OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); +	OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); +	OUT_RING  (ps->wrap); +	OUT_RING  (0x40000000); /* enable */ +	OUT_RING  (txs); +	OUT_RING  (ps->filt | 0x2000 /* magic */); +	OUT_RING  ((pt->width[0] << 16) | pt->height[0]); +	OUT_RING  (ps->bcol); +#endif +} + +void +nv20_fragtex_bind(struct nv20_context *nv20) +{ +#if 0 +	struct nv20_fragment_program *fp = nv20->fragprog.active; +	unsigned samplers, unit; + +	samplers = nv20->fp_samplers & ~fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1); +		OUT_RING  (0); +	} + +	samplers = nv20->dirty_samplers & fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		nv20_fragtex_build(nv20, unit); +	} + +	nv20->fp_samplers = fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c new file mode 100644 index 0000000000..ef7e9c5428 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_miptree.c @@ -0,0 +1,206 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_miptree_layout(struct nv20_miptree *nv20mt) +{ +	struct pipe_texture *pt = &nv20mt->base; +	boolean swizzled = FALSE; +	uint width = pt->width[0], height = pt->height[0]; +	uint offset = 0; +	int nr_faces, l, f; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		nr_faces = 6; +	} else { +		nr_faces = 1; +	} +	 +	for (l = 0; l <= pt->last_level; l++) { +		pt->width[l] = width; +		pt->height[l] = height; +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + +		if (swizzled) +			nv20mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; +		else +			nv20mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; +		nv20mt->level[l].pitch = (nv20mt->level[l].pitch + 63) & ~63; + +		nv20mt->level[l].image_offset = +			CALLOC(nr_faces, sizeof(unsigned)); + +		width  = MAX2(1, width  >> 1); +		height = MAX2(1, height >> 1); + +	} + +	for (f = 0; f < nr_faces; f++) { +		for (l = 0; l <= pt->last_level; l++) { +			nv20mt->level[l].image_offset[f] = offset; +			offset += nv20mt->level[l].pitch * pt->height[l]; +		} +	} + +	nv20mt->total_size = offset; +} + +static struct pipe_texture * +nv20_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv20_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv20_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = 
pscreen; +	mt->level[0].pitch = stride[0]; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static struct pipe_texture * +nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ +	struct pipe_winsys *ws = screen->winsys; +	struct nv20_miptree *mt; +	unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL | +	                     NOUVEAU_BUFFER_USAGE_TEXTURE; + +	mt = MALLOC(sizeof(struct nv20_miptree)); +	if (!mt) +		return NULL; +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = screen; + +	/* Swizzled textures must be POT */ +	if (pt->width[0] & (pt->width[0] - 1) || +	    pt->height[0] & (pt->height[0] - 1)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | +	                     PIPE_TEXTURE_USAGE_DISPLAY_TARGET)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else { +		switch (pt->format) { +		/* TODO: Figure out which formats can be swizzled */ +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_X8R8G8B8_UNORM: +		case PIPE_FORMAT_R16_SNORM: +			break; +		default: +			mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +		} +	} + +	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) +		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE; + +	nv20_miptree_layout(mt); + +	mt->buffer = ws->buffer_create(ws, 256, buf_usage, mt->total_size); +	if (!mt->buffer) { +		FREE(mt); +		return NULL; +	} +	 +	return &mt->base; +} + +static void +nv20_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ +	struct pipe_texture *mt = *pt; + +	*pt = NULL; +	if (--mt->refcount <= 0) { +		struct nv20_miptree *nv20mt = (struct nv20_miptree *)mt; +		int l; + +		pipe_buffer_reference(screen, &nv20mt->buffer, NULL); +		for (l = 0; l <= mt->last_level; l++) { +			if (nv20mt->level[l].image_offset) +				FREE(nv20mt->level[l].image_offset); +		} +		FREE(nv20mt); +	} +} + +static struct pipe_surface * +nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct nv20_miptree *nv20mt = (struct nv20_miptree *)pt; +	struct pipe_surface *ps; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = nv20mt->level[level].pitch; +	ps->usage = flags; +	ps->status = PIPE_SURFACE_STATUS_DEFINED; +	ps->refcount = 1; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		ps->offset = nv20mt->level[level].image_offset[face]; +	} else +	if (pt->target == PIPE_TEXTURE_3D) { +		ps->offset = nv20mt->level[level].image_offset[zslice]; +	} else { +		ps->offset = nv20mt->level[level].image_offset[0]; +	} + +	return ps; +} + +static void +nv20_miptree_surface_release(struct pipe_screen *pscreen, +			     struct pipe_surface **psurface) +{ +	struct pipe_surface *ps = *psurface; + +	*psurface = NULL; +	if (--ps->refcount > 0) +		return; + +	pipe_texture_reference(&ps->texture, NULL); +	FREE(ps); +} + +void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv20_miptree_create; +	pscreen->texture_blanket = nv20_miptree_blanket; +	
pscreen->texture_release = nv20_miptree_release; +	pscreen->get_tex_surface = nv20_miptree_surface_get; +	pscreen->tex_surface_release = nv20_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c new file mode 100644 index 0000000000..4dd7052814 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c @@ -0,0 +1,402 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. + * + * XXX: work in progress  + *  + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv20. + */ +struct nv20_vbuf_render { +	struct vbuf_render base; + +	struct nv20_context *nv20;    + +	/** Vertex buffer in VRAM */ +	struct pipe_buffer *pbuffer; + +	/** Vertex buffer in normal memory */ +	void *mbuffer; + +	/** Vertex size in bytes */ +	/*unsigned vertex_size;*/ + +	/** Hardware primitive */ +	unsigned hwprim; +}; + +/** + * Basically a cast wrapper. 
+ */ +static INLINE struct nv20_vbuf_render * +nv20_vbuf_render(struct vbuf_render *render) +{ +	assert(render); +	return (struct nv20_vbuf_render *)render; +} + +void nv20_vtxbuf_bind( struct nv20_context* nv20 ) +{ +#if 0 +	int i; +	for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) { +		BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1); +		OUT_RING(0/*nv20->vtxbuf*/); +		BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1); +		OUT_RING(0/*XXX*/); +	} +#endif +} + +static const struct vertex_info * +nv20_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); +	struct nv20_context *nv20 = nv20_render->nv20; + +	nv20_emit_hw_state(nv20); + +	return &nv20->vertex_info; +} + +static void * +nv20__allocate_mbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ +	nv20_render->mbuffer = MALLOC(size); +	return nv20_render->mbuffer; +} + +static void * +nv20__allocate_pbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ +	struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys; +	nv20_render->pbuffer = winsys->buffer_create(winsys, 64, +					PIPE_BUFFER_USAGE_VERTEX, size); +	return winsys->buffer_map(winsys, +			nv20_render->pbuffer, +			PIPE_BUFFER_USAGE_CPU_WRITE); +} + +static void * +nv20_vbuf_render_allocate_vertices( struct vbuf_render *render, +		ushort vertex_size, +		ushort nr_vertices ) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); +	size_t size = (size_t)vertex_size * (size_t)nr_vertices; +	void *buf; + +	assert(!nv20_render->pbuffer); +	assert(!nv20_render->mbuffer); + +	/* +	 * For small amount of vertices, don't bother with pipe vertex +	 * buffer, the data will be passed directly via the fifo. +	 */ +	/* XXX: Pipe vertex buffers don't work. 
*/ +	if (0 && size > 16 * 1024) +		buf = nv20__allocate_pbuffer(nv20_render, size); +	else +		buf = nv20__allocate_mbuffer(nv20_render, size); + +	if (buf) +		nv20_render->nv20->dirty |= NV20_NEW_VTXARRAYS; + +	return buf; +} + +static boolean +nv20_vbuf_render_set_primitive( struct vbuf_render *render,  +		unsigned prim ) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); +	unsigned hwp = nvgl_primitive(prim); +	if (hwp == 0) +		return FALSE; + +	nv20_render->hwprim = hwp; +	return TRUE; +} + +static uint32_t +nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type) +{ +	return (stride << NV20TCL_VTXFMT_STRIDE_SHIFT) | +		(fields << NV20TCL_VTXFMT_SIZE_SHIFT) | +		(type << NV20TCL_VTXFMT_TYPE_SHIFT); +} + +static unsigned +nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr) +{ +	uint32_t hwfmt = 0; +	unsigned fields; + +	switch (type) { +	case EMIT_OMIT: +		hwfmt = nv20__vtxhwformat(0, 0, 2); +		fields = 0; +		break; +	case EMIT_1F: +		hwfmt = nv20__vtxhwformat(4, 1, 2); +		fields = 1; +		break; +	case EMIT_2F: +		hwfmt = nv20__vtxhwformat(8, 2, 2); +		fields = 2; +		break; +	case EMIT_3F: +		hwfmt = nv20__vtxhwformat(12, 3, 2); +		fields = 3; +		break; +	case EMIT_4F: +		hwfmt = nv20__vtxhwformat(16, 4, 2); +		fields = 4; +		break; +	default: +		NOUVEAU_ERR("unhandled attrib_emit %d\n", type); +		return 0; +	} + +	BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1); +	OUT_RING(hwfmt); +	return fields; +} + +static unsigned +nv20__emit_vertex_array_format(struct nv20_context *nv20) +{ +	struct vertex_info *vinfo = &nv20->vertex_info; +	int hwattr = NV20TCL_VTXFMT__SIZE; +	int attr = 0; +	unsigned nr_fields = 0; + +	while (hwattr-- > 0) { +		if (vinfo->hwfmt[0] & (1 << hwattr)) { +			nr_fields += nv20__emit_format(nv20, +					vinfo->attrib[attr].emit, hwattr); +			attr++; +		} else +			nv20__emit_format(nv20, EMIT_OMIT, hwattr); +	} + +	return nr_fields; +} + +static void +nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render, +		const ushort *indices, +		uint nr_indices) +{ +	struct nv20_context *nv20 = nv20_render->nv20; +	struct vertex_info *vinfo = &nv20->vertex_info; +	unsigned nr_fields; +	int max_push; +	ubyte *data = nv20_render->mbuffer; +	int vsz = 4 * vinfo->size; + +	nr_fields = nv20__emit_vertex_array_format(nv20); + +	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); +	OUT_RING(nv20_render->hwprim); + +	max_push = 1200 / nr_fields; +	while (nr_indices) { +		int i; +		int push = MIN2(nr_indices, max_push); + +		BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields); +		for (i = 0; i < push; i++) { +			/* XXX: fixme to handle other than floats? 
*/ +			int f = nr_fields; +			float *attrv = (float*)&data[indices[i] * vsz]; +			while (f-- > 0) +				OUT_RINGf(*attrv++); +		} + +		nr_indices -= push; +		indices += push; +	} + +	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); +	OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP); +} + +static void +nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render, +		const ushort *indices, +		uint nr_indices) +{ +	struct nv20_context *nv20 = nv20_render->nv20; +	int push, i; + +	NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n"); + +	BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); +	OUT_RELOCl(nv20_render->pbuffer, 0, +			NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + +	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); +	OUT_RING(nv20_render->hwprim); + +	if (nr_indices & 1) { +		BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1); +		OUT_RING  (indices[0]); +		indices++; nr_indices--; +	} + +	while (nr_indices) { +		// XXX too big/small ? check the size +		push = MIN2(nr_indices, 1200 * 2); + +		BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1); +		for (i = 0; i < push; i+=2) +			OUT_RING((indices[i+1] << 16) | indices[i]); + +		nr_indices -= push; +		indices  += push; +	} + +	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); +	OUT_RING  (0); +} + +static void +nv20_vbuf_render_draw( struct vbuf_render *render, +		const ushort *indices, +		uint nr_indices) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + +	nv20_emit_hw_state(nv20_render->nv20); + +	if (nv20_render->pbuffer) +		nv20__draw_pbuffer(nv20_render, indices, nr_indices); +	else if (nv20_render->mbuffer) +		nv20__draw_mbuffer(nv20_render, indices, nr_indices); +	else +		assert(0); +} + + +static void +nv20_vbuf_render_release_vertices( struct vbuf_render *render, +		void *vertices,  +		unsigned vertex_size, +		unsigned vertices_used ) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); +	struct nv20_context *nv20 = nv20_render->nv20; +	struct pipe_winsys *winsys = nv20->pipe.winsys; +	struct pipe_screen *pscreen = &nv20->screen->pipe; + +	if (nv20_render->pbuffer) { +		winsys->buffer_unmap(winsys, nv20_render->pbuffer); +		pipe_buffer_reference(pscreen, &nv20_render->pbuffer, NULL); +	} else if (nv20_render->mbuffer) { +		FREE(nv20_render->mbuffer); +		nv20_render->mbuffer = NULL; +	} else +		assert(0); +} + + +static void +nv20_vbuf_render_destroy( struct vbuf_render *render ) +{ +	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + +	assert(!nv20_render->pbuffer); +	assert(!nv20_render->mbuffer); + +	FREE(nv20_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv20_vbuf_render_create( struct nv20_context *nv20 ) +{ +	struct nv20_vbuf_render *nv20_render = CALLOC_STRUCT(nv20_vbuf_render); + +	nv20_render->nv20 = nv20; + +	nv20_render->base.max_vertex_buffer_bytes = 16*1024; +	nv20_render->base.max_indices = 1024; +	nv20_render->base.get_vertex_info = nv20_vbuf_render_get_vertex_info; +	nv20_render->base.allocate_vertices = +					nv20_vbuf_render_allocate_vertices; +	nv20_render->base.set_primitive = nv20_vbuf_render_set_primitive; +	nv20_render->base.draw = nv20_vbuf_render_draw; +	nv20_render->base.release_vertices = nv20_vbuf_render_release_vertices; +	nv20_render->base.destroy = nv20_vbuf_render_destroy; + +	return &nv20_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. 
+ */ +struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ) +{ +	struct vbuf_render *render; +	struct draw_stage *stage; + +	render = nv20_vbuf_render_create(nv20); +	if(!render) +		return NULL; + +	stage = draw_vbuf_stage( nv20->draw, render ); +	if(!stage) { +		render->destroy(render); +		return NULL; +	} + +	return stage; +} diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c new file mode 100644 index 0000000000..5f2b7b4f71 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.c @@ -0,0 +1,222 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static const char * +nv20_screen_get_name(struct pipe_screen *screen) +{ +	struct nv20_screen *nv20screen = nv20_screen(screen); +	struct nouveau_device *dev = nv20screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv20_screen_get_vendor(struct pipe_screen *screen) +{ +	return "nouveau"; +} + +static int +nv20_screen_get_param(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 2; +	case PIPE_CAP_NPOT_TEXTURES: +		return 0; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 0; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 0; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 1; +	case PIPE_CAP_POINT_SPRITE: +		return 0; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 1; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 0; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 12; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 0; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 12; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; +	case NOUVEAU_CAP_HW_VTXBUF: +	case NOUVEAU_CAP_HW_IDXBUF: +		return 0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv20_screen_get_paramf(struct pipe_screen *screen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 10.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 64.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 2.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 4.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static boolean +nv20_screen_is_format_supported(struct pipe_screen *screen, +				enum pipe_format format, +				enum pipe_texture_target target, +				unsigned tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_A4R4G4B4_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +		case PIPE_FORMAT_I8_UNORM: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static void * +nv20_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, +		 unsigned flags ) +{ +	struct pipe_winsys *ws = screen->winsys; +	void *map; +	struct nv20_miptree *nv20mt = (struct nv20_miptree *)surface->texture; + +	map = ws->buffer_map(ws, nv20mt->buffer, flags); +	
if (!map) +		return NULL; + +	return map + surface->offset; +} + +static void +nv20_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ +	struct pipe_winsys *ws = screen->winsys; +	struct nv20_miptree *nv20mt = (struct nv20_miptree *)surface->texture; + +	ws->buffer_unmap(ws, nv20mt->buffer); +} + +static void +nv20_screen_destroy(struct pipe_screen *pscreen) +{ +	struct nv20_screen *screen = nv20_screen(pscreen); +	struct nouveau_winsys *nvws = screen->nvws; + +	nvws->notifier_free(&screen->sync); +	nvws->grobj_free(&screen->kelvin); + +	FREE(pscreen); +} + +static struct pipe_buffer * +nv20_surface_buffer(struct pipe_surface *surf) +{ +	struct nv20_miptree *mt = (struct nv20_miptree *)surf->texture; + +	return mt->buffer; +} + +struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv20_screen *screen = CALLOC_STRUCT(nv20_screen); +	unsigned kelvin_class = 0; +	unsigned chipset = nvws->channel->device->chipset; +	int ret; + +	if (!screen) +		return NULL; +	screen->nvws = nvws; + +	/* 2D engine setup */ +	screen->eng2d = nv04_surface_2d_init(nvws); +	screen->eng2d->buf = nv20_surface_buffer; + +	/* 3D object */ +	if (chipset >= 0x25) +		kelvin_class = NV25TCL; +	else if (chipset >= 0x20) +		kelvin_class = NV20TCL; + +	if (!kelvin_class || chipset >= 0x30) { +		NOUVEAU_ERR("Unknown nv2x chipset: nv%02x\n", chipset); +		return NULL; +	} + +	ret = nvws->grobj_alloc(nvws, kelvin_class, &screen->kelvin); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		return FALSE; +	} + +	/* Notifier for sync purposes */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv20_screen_destroy(&screen->pipe); +		return NULL; +	} + +	screen->pipe.winsys = ws; +	screen->pipe.destroy = nv20_screen_destroy; + +	screen->pipe.get_name = nv20_screen_get_name; +	screen->pipe.get_vendor = nv20_screen_get_vendor; +	screen->pipe.get_param = nv20_screen_get_param; +	screen->pipe.get_paramf = nv20_screen_get_paramf; + +	screen->pipe.is_format_supported = nv20_screen_is_format_supported; + +	screen->pipe.surface_map = nv20_surface_map; +	screen->pipe.surface_unmap = nv20_surface_unmap; + +	nv20_screen_init_miptree_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_screen.h b/src/gallium/drivers/nv20/nv20_screen.h new file mode 100644 index 0000000000..bf2f2c0d9f --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.h @@ -0,0 +1,24 @@ +#ifndef __NV20_SCREEN_H__ +#define __NV20_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv20_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; + +	/* HW graphics objects */ +	struct nv04_surface_2d *eng2d; +	struct nouveau_grobj *kelvin; +	struct nouveau_notifier *sync; +}; + +static INLINE struct nv20_screen * +nv20_screen(struct pipe_screen *screen) +{ +	return (struct nv20_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c new file mode 100644 index 0000000000..ecec4f49a0 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.c @@ -0,0 +1,582 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +static void * +nv20_blend_state_create(struct 
pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nv20_blend_state *cb; + +	cb = MALLOC(sizeof(struct nv20_blend_state)); + +	cb->b_enable = cso->blend_enable ? 1 : 0; +	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_src_factor))); +	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | +			 (nvgl_blend_func(cso->rgb_dst_factor))); + +	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | +		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | +		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | +		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + +	cb->d_enable = cso->dither ? 1 : 0; + +	return (void *)cb; +} + +static void +nv20_blend_state_bind(struct pipe_context *pipe, void *blend) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->blend = (struct nv20_blend_state*)blend; + +	nv20->dirty |= NV20_NEW_BLEND; +} + +static void +nv20_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { +	unsigned ret; + +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		ret = NV20TCL_TX_WRAP_S_REPEAT; +		break; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		ret = NV20TCL_TX_WRAP_S_MIRRORED_REPEAT; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		ret = NV20TCL_TX_WRAP_S_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		ret = NV20TCL_TX_WRAP_S_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_CLAMP: +		ret = NV20TCL_TX_WRAP_S_CLAMP; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		ret = NV20TCL_TX_WRAP_S_REPEAT; +		break; +	} + +	return (ret >> NV20TCL_TX_WRAP_S_SHIFT); +} + +static void * +nv20_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ +	struct nv20_sampler_state *ps; +	uint32_t filter = 0; + +	ps = MALLOC(sizeof(struct nv20_sampler_state)); + +	ps->wrap = ((wrap_mode(cso->wrap_s) << NV20TCL_TX_WRAP_S_SHIFT) | +		    (wrap_mode(cso->wrap_t) << NV20TCL_TX_WRAP_T_SHIFT)); + +	ps->en = 0; +	if (cso->max_anisotropy > 1.0) { +		/* no idea, binary driver sets it, works without it.. meh.. 
*/ +		ps->wrap |= (1 << 5); + +/*		if (cso->max_anisotropy >= 8.0) { +			ps->en |= NV20TCL_TX_ENABLE_ANISO_8X; +		} else +		if (cso->max_anisotropy >= 4.0) { +			ps->en |= NV20TCL_TX_ENABLE_ANISO_4X; +		} else { +			ps->en |= NV20TCL_TX_ENABLE_ANISO_2X; +		}*/ +	} + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		filter |= NV20TCL_TX_FILTER_MAGNIFY_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		filter |= NV20TCL_TX_FILTER_MAGNIFY_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= +				NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; +			break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR; +			break; +		} +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= +				NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; +		break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= +				NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV20TCL_TX_FILTER_MINIFY_NEAREST; +			break; +		} +		break; +	} + +	ps->filt = filter; + +/*	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +		switch (cso->compare_func) { +		case PIPE_FUNC_NEVER: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; +			break; +		case PIPE_FUNC_GREATER: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; +			break; +		case PIPE_FUNC_EQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; +			break; +		case PIPE_FUNC_GEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; +			break; +		case PIPE_FUNC_LESS: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; +			break; +		case PIPE_FUNC_NOTEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; +			break; +		case PIPE_FUNC_LEQUAL: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; +			break; +		case PIPE_FUNC_ALWAYS: +			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; +			break; +		default: +			break; +		} +	}*/ + +	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | +		    (float_to_ubyte(cso->border_color[0]) << 16) | +		    (float_to_ubyte(cso->border_color[1]) <<  8) | +		    (float_to_ubyte(cso->border_color[2]) <<  0)); + +	return (void *)ps; +} + +static void +nv20_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv20->tex_sampler[unit] = sampler[unit]; +		nv20->dirty_samplers |= (1 << unit); +	} +} + +static void +nv20_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void +nv20_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **miptree) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv20->tex_miptree[unit] = (struct nv20_miptree *)miptree[unit]; +		nv20->dirty_samplers |= (1 << unit); +	} +} + +static void * +nv20_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nv20_rasterizer_state *rs; +	int i; + +	/*XXX: ignored: +	 * 	light_twoside +	 * 	offset_cw/ccw -nohw +	 * 	scissor +	 * 	point_smooth -nohw +	 * 	multisample +	 * 	offset_units / offset_scale +	 */ +	rs = MALLOC(sizeof(struct nv20_rasterizer_state)); 
+ +	rs->templ = cso; +	 +	rs->shade_model = cso->flatshade ? NV20TCL_SHADE_MODEL_FLAT : +						NV20TCL_SHADE_MODEL_SMOOTH; + +	rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; +	rs->line_smooth_en = cso->line_smooth ? 1 : 0; + +	/* XXX: nv20 and nv25 different! */ +	rs->point_size = *(uint32_t*)&cso->point_size; + +	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + +	if (cso->front_winding == PIPE_WINDING_CCW) { +		rs->front_face = NV20TCL_FRONT_FACE_CCW; +		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); +		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw); +	} else { +		rs->front_face = NV20TCL_FRONT_FACE_CW; +		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); +		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw); +	} + +	switch (cso->cull_mode) { +	case PIPE_WINDING_CCW: +		rs->cull_face_en = 1; +		if (cso->front_winding == PIPE_WINDING_CCW) +			rs->cull_face    = NV20TCL_CULL_FACE_FRONT; +		else +			rs->cull_face    = NV20TCL_CULL_FACE_BACK; +		break; +	case PIPE_WINDING_CW: +		rs->cull_face_en = 1; +		if (cso->front_winding == PIPE_WINDING_CW) +			rs->cull_face    = NV20TCL_CULL_FACE_FRONT; +		else +			rs->cull_face    = NV20TCL_CULL_FACE_BACK; +		break; +	case PIPE_WINDING_BOTH: +		rs->cull_face_en = 1; +		rs->cull_face    = NV20TCL_CULL_FACE_FRONT_AND_BACK; +		break; +	case PIPE_WINDING_NONE: +	default: +		rs->cull_face_en = 0; +		rs->cull_face    = 0; +		break; +	} + +	if (cso->point_sprite) { +		rs->point_sprite = (1 << 0); +		for (i = 0; i < 8; i++) { +			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) +				rs->point_sprite |= (1 << (8 + i)); +		} +	} else { +		rs->point_sprite = 0; +	} + +	return (void *)rs; +} + +static void +nv20_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->rast = (struct nv20_rasterizer_state*)rast; + +	draw_set_rasterizer_state(nv20->draw, (nv20->rast ? nv20->rast->templ : NULL)); + +	nv20->dirty |= NV20_NEW_RAST; +} + +static void +nv20_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void * +nv20_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nv20_depth_stencil_alpha_state *hw; + +	hw = MALLOC(sizeof(struct nv20_depth_stencil_alpha_state)); + +	hw->depth.func		= nvgl_comparison_op(cso->depth.func); +	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0; +	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0; + +	hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; +	hw->stencil.wmask = cso->stencil[0].writemask; +	hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); +	hw->stencil.ref	= cso->stencil[0].ref_value; +	hw->stencil.vmask = cso->stencil[0].valuemask; +	hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); +	hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); +	hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + +	hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; +	hw->alpha.func = nvgl_comparison_op(cso->alpha.func); +	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref_value); + +	return (void *)hw; +} + +static void +nv20_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->dsa = (struct nv20_depth_stencil_alpha_state*)dsa; + +	nv20->dirty |= NV20_NEW_DSA; +} + +static void +nv20_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void * +nv20_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *templ) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	return draw_create_vertex_shader(nv20->draw, templ); +} + +static void +nv20_vp_state_bind(struct pipe_context *pipe, void *shader) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	draw_bind_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); + +	nv20->dirty |= NV20_NEW_VERTPROG; +} + +static void +nv20_vp_state_delete(struct pipe_context *pipe, void *shader) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	draw_delete_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv20_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv20_fragment_program *fp; + +	fp = CALLOC(1, sizeof(struct nv20_fragment_program)); +	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	 +	tgsi_scan_shader(cso->tokens, &fp->info); + +	return (void *)fp; +} + +static void +nv20_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	struct nv20_fragment_program *fp = hwcso; + +	nv20->fragprog.current = fp; +	nv20->dirty |= NV20_NEW_FRAGPROG; +} + +static void +nv20_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	struct nv20_fragment_program *fp = hwcso; + +	nv20_fragprog_destroy(nv20, fp); +	FREE((void*)fp->pipe.tokens); +	FREE(fp); +} + +static void +nv20_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->blend_color = (struct pipe_blend_color*)bcol; + +	nv20->dirty |= NV20_NEW_BLENDCOL; +} + +static void +nv20_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	draw_set_clip_state(nv20->draw, clip); +} + +static void +nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	struct pipe_winsys *ws = pipe->winsys; + +	assert(shader < PIPE_SHADER_TYPES); +	assert(index == 0); + +	if (buf) { +		void *mapped; +		if (buf->buffer && buf->buffer->size && +                    (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) +		{ +			memcpy(nv20->constbuf[shader], mapped, buf->buffer->size); +			nv20->constbuf_nr[shader] = +				buf->buffer->size / (4 * sizeof(float)); +			ws->buffer_unmap(ws, buf->buffer); +		} +	} +} + +static void +nv20_set_framebuffer_state(struct pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->framebuffer = (struct pipe_framebuffer_state*)fb; + +	nv20->dirty |= NV20_NEW_FRAMEBUFFER; +} + +static void +nv20_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	
NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv20_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->scissor = (struct pipe_scissor_state*)s; + +	nv20->dirty |= NV20_NEW_SCISSOR; +} + +static void +nv20_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *vpt) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	nv20->viewport = (struct pipe_viewport_state*)vpt; + +	draw_set_viewport_state(nv20->draw, nv20->viewport); + +	nv20->dirty |= NV20_NEW_VIEWPORT; +} + +static void +nv20_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_buffer *vb) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	memcpy(nv20->vtxbuf, vb, sizeof(*vb) * count); +	nv20->dirty |= NV20_NEW_VTXARRAYS; + +	draw_set_vertex_buffers(nv20->draw, count, vb); +} + +static void +nv20_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			 const struct pipe_vertex_element *ve) +{ +	struct nv20_context *nv20 = nv20_context(pipe); + +	memcpy(nv20->vtxelt, ve, sizeof(*ve) * count); +	nv20->dirty |= NV20_NEW_VTXARRAYS; + +	draw_set_vertex_elements(nv20->draw, count, ve); +} + +void +nv20_init_state_functions(struct nv20_context *nv20) +{ +	nv20->pipe.create_blend_state = nv20_blend_state_create; +	nv20->pipe.bind_blend_state = nv20_blend_state_bind; +	nv20->pipe.delete_blend_state = nv20_blend_state_delete; + +	nv20->pipe.create_sampler_state = nv20_sampler_state_create; +	nv20->pipe.bind_sampler_states = nv20_sampler_state_bind; +	nv20->pipe.delete_sampler_state = nv20_sampler_state_delete; +	nv20->pipe.set_sampler_textures = nv20_set_sampler_texture; + +	nv20->pipe.create_rasterizer_state = nv20_rasterizer_state_create; +	nv20->pipe.bind_rasterizer_state = nv20_rasterizer_state_bind; +	nv20->pipe.delete_rasterizer_state = nv20_rasterizer_state_delete; + +	nv20->pipe.create_depth_stencil_alpha_state = +		nv20_depth_stencil_alpha_state_create; +	nv20->pipe.bind_depth_stencil_alpha_state = +		nv20_depth_stencil_alpha_state_bind; +	nv20->pipe.delete_depth_stencil_alpha_state = +		nv20_depth_stencil_alpha_state_delete; + +	nv20->pipe.create_vs_state = nv20_vp_state_create; +	nv20->pipe.bind_vs_state = nv20_vp_state_bind; +	nv20->pipe.delete_vs_state = nv20_vp_state_delete; + +	nv20->pipe.create_fs_state = nv20_fp_state_create; +	nv20->pipe.bind_fs_state = nv20_fp_state_bind; +	nv20->pipe.delete_fs_state = nv20_fp_state_delete; + +	nv20->pipe.set_blend_color = nv20_set_blend_color; +	nv20->pipe.set_clip_state = nv20_set_clip_state; +	nv20->pipe.set_constant_buffer = nv20_set_constant_buffer; +	nv20->pipe.set_framebuffer_state = nv20_set_framebuffer_state; +	nv20->pipe.set_polygon_stipple = nv20_set_polygon_stipple; +	nv20->pipe.set_scissor_state = nv20_set_scissor_state; +	nv20->pipe.set_viewport_state = nv20_set_viewport_state; + +	nv20->pipe.set_vertex_buffers = nv20_set_vertex_buffers; +	nv20->pipe.set_vertex_elements = nv20_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv20/nv20_state.h b/src/gallium/drivers/nv20/nv20_state.h new file mode 100644 index 0000000000..34f402fdcb --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.h @@ -0,0 +1,139 @@ +#ifndef __NV20_STATE_H__ +#define __NV20_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv20_blend_state { +	uint32_t b_enable; +	uint32_t b_srcfunc; +	uint32_t b_dstfunc; + +	uint32_t c_mask; + +	uint32_t d_enable; +}; + +struct 
nv20_sampler_state { +	uint32_t wrap; +	uint32_t en; +	uint32_t filt; +	uint32_t bcol; +}; + +struct nv20_rasterizer_state { +	uint32_t shade_model; + +	uint32_t line_width; +	uint32_t line_smooth_en; + +	uint32_t point_size; + +	uint32_t poly_smooth_en; +	 +	uint32_t poly_mode_front; +	uint32_t poly_mode_back; + +	uint32_t front_face; +	uint32_t cull_face; +	uint32_t cull_face_en; + +	uint32_t point_sprite; + +	const struct pipe_rasterizer_state *templ; +}; + +struct nv20_vertex_program_exec { +	uint32_t data[4]; +	boolean has_branch_offset; +	int const_index; +}; + +struct nv20_vertex_program_data { +	int index; /* immediates == -1 */ +	float value[4]; +}; + +struct nv20_vertex_program { +	const struct pipe_shader_state *pipe; + +	boolean translated; +	struct nv20_vertex_program_exec *insns; +	unsigned nr_insns; +	struct nv20_vertex_program_data *consts; +	unsigned nr_consts; + +	struct nouveau_resource *exec; +	unsigned exec_start; +	struct nouveau_resource *data; +	unsigned data_start; +	unsigned data_start_min; + +	uint32_t ir; +	uint32_t or; +}; + +struct nv20_fragment_program_data { +	unsigned offset; +	unsigned index; +}; + +struct nv20_fragment_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; + +	boolean translated; +	boolean on_hw; +	unsigned samplers; + +	uint32_t *insn; +	int       insn_len; + +	struct nv20_fragment_program_data *consts; +	unsigned nr_consts; + +	struct pipe_buffer *buffer; + +	uint32_t fp_control; +	uint32_t fp_reg_control; +}; + + +struct nv20_depth_stencil_alpha_state { +	struct { +		uint32_t func; +		uint32_t write_enable; +		uint32_t test_enable; +	} depth; + +	struct { +		uint32_t enable; +		uint32_t wmask; +		uint32_t func; +		uint32_t ref; +		uint32_t vmask; +		uint32_t fail; +		uint32_t zfail; +		uint32_t zpass; +	} stencil; + +	struct { +		uint32_t enabled; +		uint32_t func; +		uint32_t ref; +	} alpha; +}; + +struct nv20_miptree { +	struct pipe_texture base; + +	struct pipe_buffer *buffer; +	uint total_size; + +	struct { +		uint pitch; +		uint *image_offset; +	} level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c new file mode 100644 index 0000000000..0f4df9ca31 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state_emit.c @@ -0,0 +1,396 @@ +#include "nv20_context.h" +#include "nv20_state.h" +#include "draw/draw_context.h" + +static void nv20_state_emit_blend(struct nv20_context* nv20) +{ +	struct nv20_blend_state *b = nv20->blend; + +	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); +	OUT_RING  (b->d_enable); + +	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); +	OUT_RING  (b->b_enable); + +	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2); +	OUT_RING  (b->b_srcfunc); +	OUT_RING  (b->b_dstfunc); + +	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); +	OUT_RING  (b->c_mask); +} + +static void nv20_state_emit_blend_color(struct nv20_context* nv20) +{ +	struct pipe_blend_color *c = nv20->blend_color; + +	BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1); +	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)| +		   (float_to_ubyte(c->color[0]) << 16)| +		   (float_to_ubyte(c->color[1]) << 8) | +		   (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv20_state_emit_rast(struct nv20_context* nv20) +{ +	struct nv20_rasterizer_state *r = nv20->rast; + +	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2); +	OUT_RING  (r->shade_model); +	OUT_RING  (r->line_width); + + +	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); +	OUT_RING  (r->point_size); + +	BEGIN_RING(kelvin, 
NV20TCL_POLYGON_MODE_FRONT, 2); +	OUT_RING  (r->poly_mode_front); +	OUT_RING  (r->poly_mode_back); + + +	BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2); +	OUT_RING  (r->cull_face); +	OUT_RING  (r->front_face); + +	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2); +	OUT_RING  (r->line_smooth_en); +	OUT_RING  (r->poly_smooth_en); + +	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); +	OUT_RING  (r->cull_face_en); +} + +static void nv20_state_emit_dsa(struct nv20_context* nv20) +{ +	struct nv20_depth_stencil_alpha_state *d = nv20->dsa; + +	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); +	OUT_RING (d->depth.func); + +	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); +	OUT_RING (d->depth.write_enable); + +	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); +	OUT_RING (d->depth.test_enable); + +	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1); +	OUT_RING (1); + +#if 0 +	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); +	OUT_RING (d->stencil.enable); +	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); +	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + +	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); +	OUT_RING (d->alpha.enabled); + +	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1); +	OUT_RING (d->alpha.func); + +	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1); +	OUT_RING (d->alpha.ref); +} + +static void nv20_state_emit_viewport(struct nv20_context* nv20) +{ +} + +static void nv20_state_emit_scissor(struct nv20_context* nv20) +{ +	/* NV20TCL_SCISSOR_* is probably a software method */ +/*	struct pipe_scissor_state *s = nv20->scissor; +	BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2); +	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx); +	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv20_state_emit_framebuffer(struct nv20_context* nv20) +{ +	struct pipe_framebuffer_state* fb = nv20->framebuffer; +	struct pipe_surface *rt, *zeta = NULL; +	uint32_t rt_format, w, h; +	int colour_format = 0, zeta_format = 0; +	struct nv20_miptree *nv20mt = 0; + +	w = fb->cbufs[0]->width; +	h = fb->cbufs[0]->height; +	colour_format = fb->cbufs[0]->format; +	rt = fb->cbufs[0]; + +	if (fb->zsbuf) { +		if (colour_format) { +			assert(w == fb->zsbuf->width); +			assert(h == fb->zsbuf->height); +		} else { +			w = fb->zsbuf->width; +			h = fb->zsbuf->height; +		} + +		zeta_format = fb->zsbuf->format; +		zeta = fb->zsbuf; +	} + +	rt_format = NV20TCL_RT_FORMAT_TYPE_LINEAR | 0x20; + +	switch (colour_format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case 0: +		rt_format |= NV20TCL_RT_FORMAT_COLOR_A8R8G8B8; +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		rt_format |= NV20TCL_RT_FORMAT_COLOR_R5G6B5; +		break; +	default: +		assert(0); +	} + +	if (zeta) { +		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); +		OUT_RING  (rt->stride | (zeta->stride << 16)); +	} else { +		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); +		OUT_RING  (rt->stride | (rt->stride << 16)); +	} + +	nv20mt = (struct nv20_miptree *)rt->texture; +	nv20->rt[0] = nv20mt->buffer; + +	if (zeta_format) +	{ +		nv20mt = (struct nv20_miptree *)zeta->texture; +		nv20->zeta = nv20mt->buffer; +	} + +	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3); +	OUT_RING  ((w << 16) | 0); +	OUT_RING  ((h << 16) | 0); /*NV20TCL_RT_VERT */ +	OUT_RING  (rt_format); /* NV20TCL_RT_FORMAT */ +	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2); +	OUT_RING  (((w - 1) << 16) | 0); +	OUT_RING  (((h - 1) << 16) | 0); +} + +static void nv20_vertex_layout(struct nv20_context *nv20) +{ +	struct nv20_fragment_program *fp = nv20->fragprog.current; +	struct draw_context *dc = nv20->draw; +	int src; +	int i; +	
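+	/* Vertex layout for the draw (swtnl) module, rebuilt from the
+	 * inputs the bound fragment program actually reads. */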
struct vertex_info *vinfo = &nv20->vertex_info; +	const enum interp_mode colorInterp = INTERP_LINEAR; +	boolean colors[2] = { FALSE }; +	boolean generics[12] = { FALSE }; +	boolean fog = FALSE; + +	memset(vinfo, 0, sizeof(*vinfo)); + +	/* +	 * Assumed NV20 hardware vertex attribute order: +	 * 0 position, 1 ?, 2 ?, 3 col0, +	 * 4 col1?, 5 ?, 6 ?, 7 ?, +	 * 8 ?, 9 tex0, 10 tex1, 11 tex2, +	 * 12 tex3, 13 ?, 14 ?, 15 ? +	 * unaccounted: wgh, nor, fog +	 * There are total 16 attrs. +	 * vinfo->hwfmt[0] has a used-bit corresponding to each of these. +	 * relation to TGSI_SEMANTIC_*: +	 * - POSITION: position (always used) +	 * - COLOR: col1, col0 +	 * - GENERIC: tex3, tex2, tex1, tex0, normal, weight +	 * - FOG: fog +	 */ + +	for (i = 0; i < fp->info.num_inputs; i++) { +		int isn = fp->info.input_semantic_name[i]; +		int isi = fp->info.input_semantic_index[i]; +		switch (isn) { +		case TGSI_SEMANTIC_POSITION: +			break; +		case TGSI_SEMANTIC_COLOR: +			assert(isi < 2); +			colors[isi] = TRUE; +			break; +		case TGSI_SEMANTIC_GENERIC: +			assert(isi < 12); +			generics[isi] = TRUE; +			break; +		case TGSI_SEMANTIC_FOG: +			fog = TRUE; +			break; +		default: +			assert(0 && "unknown input_semantic_name"); +		} +	} + +	/* always do position */ { +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_POSITION, 0); +		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src); +		vinfo->hwfmt[0] |= (1 << 0); +	} + +	/* two unnamed generics */ +	for (i = 4; i < 6; i++) { +		if (!generics[i]) +			continue; +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); +		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +		vinfo->hwfmt[0] |= (1 << (i - 3)); +	} + +	if (colors[0]) { +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 0); +		draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); +		vinfo->hwfmt[0] |= (1 << 3); +	} + +	if (colors[1]) { +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 1); +		draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); +		vinfo->hwfmt[0] |= (1 << 4); +	} + +	/* four unnamed generics */ +	for (i = 6; i < 10; i++) { +		if (!generics[i]) +			continue; +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); +		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +		vinfo->hwfmt[0] |= (1 << (i - 1)); +	} + +	/* tex0, tex1, tex2, tex3 */ +	for (i = 0; i < 4; i++) { +		if (!generics[i]) +			continue; +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); +		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +		vinfo->hwfmt[0] |= (1 << (i + 9)); +	} + +	/* two unnamed generics */ +	for (i = 10; i < 12; i++) { +		if (!generics[i]) +			continue; +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); +		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +		vinfo->hwfmt[0] |= (1 << (i + 3)); +	} + +	if (fog) { +		src = draw_find_vs_output(dc, TGSI_SEMANTIC_FOG, 0); +		draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); +		vinfo->hwfmt[0] |= (1 << 15); +	} + +	draw_compute_vertex_size(vinfo); +} + +void +nv20_emit_hw_state(struct nv20_context *nv20) +{ +	int i; + +	if (nv20->dirty & NV20_NEW_VERTPROG) { +		//nv20_vertprog_bind(nv20, nv20->vertprog.current); +		nv20->dirty &= ~NV20_NEW_VERTPROG; +	} + +	if (nv20->dirty & NV20_NEW_FRAGPROG) { +		nv20_fragprog_bind(nv20, nv20->fragprog.current); +		/*XXX: clear NV20_NEW_FRAGPROG if no new program uploaded */ +		nv20->dirty_samplers |= (1<<10); +		nv20->dirty_samplers = 0; +	} + +	if (nv20->dirty_samplers || (nv20->dirty & NV20_NEW_FRAGPROG)) { +		
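+		/* Texture state depends on which samplers the fragment
+		 * program references, so it is re-emitted when either changes. */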
nv20_fragtex_bind(nv20); +		nv20->dirty &= ~NV20_NEW_FRAGPROG; +	} + +	if (nv20->dirty & NV20_NEW_VTXARRAYS) { +		nv20->dirty &= ~NV20_NEW_VTXARRAYS; +		nv20_vertex_layout(nv20); +		nv20_vtxbuf_bind(nv20); +	} + +	if (nv20->dirty & NV20_NEW_BLEND) { +		nv20->dirty &= ~NV20_NEW_BLEND; +		nv20_state_emit_blend(nv20); +	} + +	if (nv20->dirty & NV20_NEW_BLENDCOL) { +		nv20->dirty &= ~NV20_NEW_BLENDCOL; +		nv20_state_emit_blend_color(nv20); +	} + +	if (nv20->dirty & NV20_NEW_RAST) { +		nv20->dirty &= ~NV20_NEW_RAST; +		nv20_state_emit_rast(nv20); +	} + +	if (nv20->dirty & NV20_NEW_DSA) { +		nv20->dirty &= ~NV20_NEW_DSA; +		nv20_state_emit_dsa(nv20); +	} + + 	if (nv20->dirty & NV20_NEW_VIEWPORT) { +		nv20->dirty &= ~NV20_NEW_VIEWPORT; +		nv20_state_emit_viewport(nv20); +	} + + 	if (nv20->dirty & NV20_NEW_SCISSOR) { +		nv20->dirty &= ~NV20_NEW_SCISSOR; +		nv20_state_emit_scissor(nv20); +	} + + 	if (nv20->dirty & NV20_NEW_FRAMEBUFFER) { +		nv20->dirty &= ~NV20_NEW_FRAMEBUFFER; +		nv20_state_emit_framebuffer(nv20); +	} + +	/* Emit relocs for every referenced buffer. +	 * This is to ensure the bufmgr has an accurate idea of how +	 * the buffer is used.  This isn't very efficient, but we don't +	 * seem to take a significant performance hit.  Will be improved +	 * at some point.  Vertex arrays are emitted by nv20_vbo.c +	 */ + +	/* Render target */ +	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 1); +	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); +	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	if (nv20->zeta) { +		BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1); +		OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +		BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1); +		OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +		/* XXX for when we allocate LMA on nv17 */ +/*		BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); +		OUT_RELOCl(nv20->zeta + lma_offset);*/ +	} + +	/* Vertex buffer */ +	BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1); +	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); +	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + +	/* Texture images */ +	for (i = 0; i < 2; i++) { +		if (!(nv20->fp_samplers & (1 << i))) +			continue; +		BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1); +		OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM | +			   NOUVEAU_BO_GART | NOUVEAU_BO_RD); +		BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1); +		OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format, +			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | +			   NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0, +			   NV20TCL_TX_FORMAT_DMA1); +	} +} + diff --git a/src/gallium/drivers/nv20/nv20_surface.c b/src/gallium/drivers/nv20/nv20_surface.c new file mode 100644 index 0000000000..6cd607583c --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "nv20_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv20_surface_copy(struct pipe_context *pipe, boolean do_flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	struct nv04_surface_2d *eng2d = nv20->screen->eng2d; + +	if (do_flip) { +		desty += height; +		while (height--) { +			eng2d->copy(eng2d, dest, destx, desty--, src, +				    srcx, srcy++, width, 1); +		} +		return; +	} + +	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv20_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv20_context *nv20 = nv20_context(pipe); +	struct nv04_surface_2d *eng2d = nv20->screen->eng2d; + +	eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv20_init_surface_functions(struct nv20_context *nv20) +{ +	nv20->pipe.surface_copy = nv20_surface_copy; +	nv20->pipe.surface_fill = nv20_surface_fill; +} diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c new file mode 100644 index 0000000000..24d8f4bef0 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vbo.c @@ -0,0 +1,78 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv20_draw_elements( struct pipe_context *pipe, +                    struct pipe_buffer *indexBuffer, +                    unsigned indexSize, +                    unsigned prim, unsigned start, unsigned count) +{ +	struct nv20_context *nv20 = nv20_context( pipe ); +	struct draw_context *draw = nv20->draw; +	unsigned i; + +	nv20_emit_hw_state(nv20); + +	/* +	 * Map vertex buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv20->vtxbuf[i].buffer) { +			void *buf +				= pipe->winsys->buffer_map(pipe->winsys, +						nv20->vtxbuf[i].buffer, +						PIPE_BUFFER_USAGE_CPU_READ); +			
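+			/* Hand the CPU-mapped vertex data to the draw module,
+			 * which performs vertex processing in software. */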
draw_set_mapped_vertex_buffer(draw, i, buf); +		} +	} +	/* Map index buffer, if present */ +	if (indexBuffer) { +		void *mapped_indexes +			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer, +					PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); +	} +	else { +		/* no index/element buffer */ +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	draw_set_mapped_constant_buffer(draw, +					nv20->constbuf[PIPE_SHADER_VERTEX], +					nv20->constbuf_nr[PIPE_SHADER_VERTEX]); + +	/* draw! */ +	draw_arrays(nv20->draw, prim, start, count); + +	/* +	 * unmap vertex/index buffers +	 */ +	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { +		if (nv20->vtxbuf[i].buffer) { +			pipe->winsys->buffer_unmap(pipe->winsys, nv20->vtxbuf[i].buffer); +			draw_set_mapped_vertex_buffer(draw, i, NULL); +		} +	} +	if (indexBuffer) { +		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); +		draw_set_mapped_element_buffer(draw, 0, NULL); +	} + +	draw_flush(nv20->draw); +	return TRUE; +} + +boolean nv20_draw_arrays( struct pipe_context *pipe, +				 unsigned prim, unsigned start, unsigned count) +{ +	return nv20_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c new file mode 100644 index 0000000000..5db0e807ff --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +/* TODO (at least...): + *  1. Indexed consts  + ARL + *  2. Arb. swz/negation + *  3. NV_vp11, NV_vp2, NV_vp3 features + *       - extra arith opcodes + *       - branching + *       - texture sampling + *       - indexed attribs + *       - indexed results + *  4. 
bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv20_shader.h" + +#define swz(s,x,y,z,w) nv20_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv20_sr_neg((s)) +#define abs(s) nv20_sr_abs((s)) + +struct nv20_vpc { +	struct nv20_vertex_program *vp; + +	struct nv20_vertex_program_exec *vpi; + +	unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + +	int high_temp; +	int temp_temp_count; + +	struct nv20_sreg *imm; +	unsigned nr_imm; +}; + +static struct nv20_sreg +temp(struct nv20_vpc *vpc) +{ +	int idx; + +	idx  = vpc->temp_temp_count++; +	idx += vpc->high_temp + 1; +	return nv20_sr(NV30SR_TEMP, idx); +} + +static struct nv20_sreg +constant(struct nv20_vpc *vpc, int pipe, float x, float y, float z, float w) +{ +	struct nv20_vertex_program *vp = vpc->vp; +	struct nv20_vertex_program_data *vpd; +	int idx; + +	if (pipe >= 0) { +		for (idx = 0; idx < vp->nr_consts; idx++) { +			if (vp->consts[idx].index == pipe) +				return nv20_sr(NV30SR_CONST, idx); +		} +	} + +	idx = vp->nr_consts++; +	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); +	vpd = &vp->consts[idx]; + +	vpd->index = pipe; +	vpd->value[0] = x; +	vpd->value[1] = y; +	vpd->value[2] = z; +	vpd->value[3] = w; +	return nv20_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ +	nv20_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv20_vpc *vpc, uint32_t *hw, int pos, struct nv20_sreg src) +{ +	struct nv20_vertex_program *vp = vpc->vp; +	uint32_t sr = 0; + +	switch (src.type) { +	case NV30SR_TEMP: +		sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); +		sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); +		break; +	case NV30SR_INPUT: +		sr |= (NV30_VP_SRC_REG_TYPE_INPUT << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		vp->ir |= (1 << src.index); +		hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); +		break; +	case NV30SR_CONST: +		sr |= (NV30_VP_SRC_REG_TYPE_CONST << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		assert(vpc->vpi->const_index == -1 || +		       vpc->vpi->const_index == src.index); +		vpc->vpi->const_index = src.index; +		break; +	case NV30SR_NONE: +		sr |= (NV30_VP_SRC_REG_TYPE_INPUT << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		break; +	default: +		assert(0); +	} + +	if (src.negate) +		sr |= NV30_VP_SRC_NEGATE; + +	if (src.abs) +		hw[0] |= (1 << (21 + pos)); + +	sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | +	       (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | +	       (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | +	       (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + +/* + * |VVV| + * d�.�b + *  \u/ + * + */ + +	switch (pos) { +	case 0: +		hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> +			  NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; +		hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << +			  NV30_VP_INST_SRC0L_SHIFT; +		break; +	case 1: +		hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; +		break; +	case 2: +		hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> +			  NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; +		hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << +			  NV30_VP_INST_SRC2L_SHIFT; +		break; +	default: +		assert(0); +	} +} + +static void +emit_dst(struct nv20_vpc *vpc, uint32_t *hw, int slot, struct nv20_sreg dst) +{ +	struct nv20_vertex_program *vp = vpc->vp; + +	switch (dst.type) { +	case NV30SR_TEMP: +		hw[0] |= (dst.index << 
NV30_VP_INST_DEST_TEMP_ID_SHIFT); +		break; +	case NV30SR_OUTPUT: +		switch (dst.index) { +		case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; +		case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; +		case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; +		case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; +		case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; +		case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break; +		case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; +		case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; +		case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; +		case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; +		case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; +		case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; +		case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; +		case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; +		default: +			break; +		} + +		hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); +		hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + +		/*XXX: no way this is entirely correct, someone needs to +		 *     figure out what exactly it is. +		 */ +		hw[3] |= 0x800; +		break; +	default: +		assert(0); +	} +} + +static void +nv20_vp_arith(struct nv20_vpc *vpc, int slot, int op, +	      struct nv20_sreg dst, int mask, +	      struct nv20_sreg s0, struct nv20_sreg s1, +	      struct nv20_sreg s2) +{ +	struct nv20_vertex_program *vp = vpc->vp; +	uint32_t *hw; + +	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); +	vpc->vpi = &vp->insns[vp->nr_insns - 1]; +	memset(vpc->vpi, 0, sizeof(*vpc->vpi)); +	vpc->vpi->const_index = -1; + +	hw = vpc->vpi->data; + +	hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); +	hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | +		  (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | +		  (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | +		  (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + +	hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +//	hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +//	hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + +	if (dst.type == NV30SR_OUTPUT) { +		if (slot) +			hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); +		else +			hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); +	} else { +		if (slot) +			hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); +		else +			hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); +	} + +	emit_dst(vpc, hw, slot, dst); +	emit_src(vpc, hw, 0, s0); +	emit_src(vpc, hw, 1, s1); +	emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv20_sreg +tgsi_src(struct nv20_vpc *vpc, const struct tgsi_full_src_register *fsrc) { +	struct nv20_sreg src; + +	switch (fsrc->SrcRegister.File) { +	case TGSI_FILE_INPUT: +		src = nv20_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); +		break; +	case TGSI_FILE_CONSTANT: +		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); +		break; +	case TGSI_FILE_IMMEDIATE: +		src = vpc->imm[fsrc->SrcRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		if (vpc->high_temp < fsrc->SrcRegister.Index) +			vpc->high_temp = fsrc->SrcRegister.Index; +		src = nv20_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); +		break; +	default: +		NOUVEAU_ERR("bad src file\n"); +		break; +	} + +	src.abs = fsrc->SrcRegisterExtMod.Absolute; +	src.negate = fsrc->SrcRegister.Negate; +	src.swz[0] = fsrc->SrcRegister.SwizzleX; +	src.swz[1] = fsrc->SrcRegister.SwizzleY; +	src.swz[2] = fsrc->SrcRegister.SwizzleZ; +	src.swz[3] = fsrc->SrcRegister.SwizzleW; +	return src; +} + +static INLINE 
struct nv20_sreg +tgsi_dst(struct nv20_vpc *vpc, const struct tgsi_full_dst_register *fdst) { +	struct nv20_sreg dst; + +	switch (fdst->DstRegister.File) { +	case TGSI_FILE_OUTPUT: +		dst = nv20_sr(NV30SR_OUTPUT, +			      vpc->output_map[fdst->DstRegister.Index]); + +		break; +	case TGSI_FILE_TEMPORARY: +		dst = nv20_sr(NV30SR_TEMP, fdst->DstRegister.Index); +		if (vpc->high_temp < dst.index) +			vpc->high_temp = dst.index; +		break; +	default: +		NOUVEAU_ERR("bad dst file\n"); +		break; +	} + +	return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ +	int mask = 0; + +	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; +	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; +	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; +	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; +	return mask; +} + +static boolean +nv20_vertprog_parse_instruction(struct nv20_vpc *vpc, +				const struct tgsi_full_instruction *finst) +{ +	struct nv20_sreg src[3], dst, tmp; +	struct nv20_sreg none = nv20_sr(NV30SR_NONE, 0); +	int mask; +	int ai = -1, ci = -1; +	int i; + +	if (finst->Instruction.Opcode == TGSI_OPCODE_END) +		return TRUE; + +	vpc->temp_temp_count = 0; +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { +			src[i] = tgsi_src(vpc, fsrc); +		} +	} + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			if (ai == -1 || ai == fsrc->SrcRegister.Index) { +				ai = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		/*XXX: index comparison is broken now that consts come from +		 *     two different register files. 
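+		 *     A CONSTANT and an IMMEDIATE with the same index are
+		 *     treated as the same source here.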
+		 */ +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_IMMEDIATE: +			if (ci == -1 || ci == fsrc->SrcRegister.Index) { +				ci = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_TEMPORARY: +			/* handled above */ +			break; +		default: +			NOUVEAU_ERR("bad src file\n"); +			return FALSE; +		} +	} + +	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]); +	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + +	switch (finst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); +		break; +	case TGSI_OPCODE_ADD: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); +		break; +	case TGSI_OPCODE_ARL: +		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_DP3: +		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DP4: +		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DPH: +		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DST: +		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_EX2: +		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_EXP: +		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_FLR: +		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FRC: +		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_LG2: +		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LIT: +		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LOG: +		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_MAD: +		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); +		break; +	case TGSI_OPCODE_MAX: +		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MIN: +		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MOV: +		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_MUL: +		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_POW: +		tmp = temp(vpc); +		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, +		      swz(src[0], X, X, X, X)); +		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), +		      swz(src[1], X, X, X, X), none); +		arith(vpc, 1, OP_EX2, dst, mask, none, none, +		      swz(tmp, X, X, X, X)); +		break; +	case TGSI_OPCODE_RCP: +		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_RET: +		break; +	case TGSI_OPCODE_RSQ: +		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_SGE: +		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SGT: +		arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SLT: +		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SUB: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); +		break; +	case TGSI_OPCODE_XPD: +		tmp = temp(vpc); +		arith(vpc, 0, OP_MUL, tmp, mask, +		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); +		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), +		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, 
Y), +		      neg(tmp)); +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); +		return FALSE; +	} + +	return TRUE; +} + +static boolean +nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc, +				const struct tgsi_full_declaration *fdec) +{ +	int hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = NV30_VP_INST_DEST_POS; +		break; +	case TGSI_SEMANTIC_COLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV30_VP_INST_DEST_COL0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV30_VP_INST_DEST_COL1; +		} else { +			NOUVEAU_ERR("bad colour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_BCOLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV30_VP_INST_DEST_BFC0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV30_VP_INST_DEST_BFC1; +		} else { +			NOUVEAU_ERR("bad bcolour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_FOG: +		hw = NV30_VP_INST_DEST_FOGC; +		break; +	case TGSI_SEMANTIC_PSIZE: +		hw = NV30_VP_INST_DEST_PSZ; +		break; +	case TGSI_SEMANTIC_GENERIC: +		if (fdec->Semantic.SemanticIndex <= 7) { +			hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); +		} else { +			NOUVEAU_ERR("bad generic semantic index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad output semantic\n"); +		return FALSE; +	} + +	vpc->output_map[fdec->DeclarationRange.First] = hw; +	return TRUE; +} + +static boolean +nv20_vertprog_prepare(struct nv20_vpc *vpc) +{ +	struct tgsi_parse_context p; +	int nr_imm = 0; + +	tgsi_parse_init(&p, vpc->vp->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch(tok->Token.Type) { +		case TGSI_TOKEN_TYPE_IMMEDIATE: +			nr_imm++; +			break; +		default: +			break; +		} +	} +	tgsi_parse_free(&p); + +	if (nr_imm) { +		vpc->imm = CALLOC(nr_imm, sizeof(struct nv20_sreg)); +		assert(vpc->imm); +	} + +	return TRUE; +} + +static void +nv20_vertprog_translate(struct nv20_context *nv20, +			struct nv20_vertex_program *vp) +{ +	struct tgsi_parse_context parse; +	struct nv20_vpc *vpc = NULL; + +	tgsi_dump(vp->pipe.tokens,0); + +	vpc = CALLOC(1, sizeof(struct nv20_vpc)); +	if (!vpc) +		return; +	vpc->vp = vp; +	vpc->high_temp = -1; + +	if (!nv20_vertprog_prepare(vpc)) { +		FREE(vpc); +		return; +	} + +	tgsi_parse_init(&parse, vp->pipe.tokens); + +	while (!tgsi_parse_end_of_tokens(&parse)) { +		tgsi_parse_token(&parse); + +		switch (parse.FullToken.Token.Type) { +		case TGSI_TOKEN_TYPE_DECLARATION: +		{ +			const struct tgsi_full_declaration *fdec; +			fdec = &parse.FullToken.FullDeclaration; +			switch (fdec->Declaration.File) { +			case TGSI_FILE_OUTPUT: +				if (!nv20_vertprog_parse_decl_output(vpc, fdec)) +					goto out_err; +				break; +			default: +				break; +			} +		} +			break; +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			const struct tgsi_full_immediate *imm; + +			imm = &parse.FullToken.FullImmediate; +			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +			assert(imm->Immediate.NrTokens == 4 + 1); +			vpc->imm[vpc->nr_imm++] = +				constant(vpc, -1, +					 imm->u.ImmediateFloat32[0].Float, +					 imm->u.ImmediateFloat32[1].Float, +					 imm->u.ImmediateFloat32[2].Float, +					 imm->u.ImmediateFloat32[3].Float); +		} +			break; +		case TGSI_TOKEN_TYPE_INSTRUCTION: +		{ +			const struct tgsi_full_instruction *finst; +			finst = &parse.FullToken.FullInstruction; +			if (!nv20_vertprog_parse_instruction(vpc, 
finst)) +				goto out_err; +		} +			break; +		default: +			break; +		} +	} + +	vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; +	vp->translated = TRUE; +out_err: +	tgsi_parse_free(&parse); +	FREE(vpc); +} + +static boolean +nv20_vertprog_validate(struct nv20_context *nv20) +{  +	struct nouveau_winsys *nvws = nv20->nvws; +	struct pipe_winsys *ws = nv20->pipe.winsys; +	struct nouveau_grobj *rankine = nv20->screen->rankine; +	struct nv20_vertex_program *vp; +	struct pipe_buffer *constbuf; +	boolean upload_code = FALSE, upload_data = FALSE; +	int i; + +	vp = nv20->vertprog; +	constbuf = nv20->constbuf[PIPE_SHADER_VERTEX]; + +	/* Translate TGSI shader into hw bytecode */ +	if (!vp->translated) { +		nv20_vertprog_translate(nv20, vp); +		if (!vp->translated) +			return FALSE; +	} + +	/* Allocate hw vtxprog exec slots */ +	if (!vp->exec) { +		struct nouveau_resource *heap = nv20->screen->vp_exec_heap; +		struct nouveau_stateobj *so; +		uint vplen = vp->nr_insns; + +		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { +			while (heap->next && heap->size < vplen) { +				struct nv20_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->exec); +			} + +			if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) +				assert(0); +		} + +		so = so_new(2, 0); +		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); +		so_data  (so, vp->exec->start); +		so_ref(so, &vp->so); + +		upload_code = TRUE; +	} + +	/* Allocate hw vtxprog const slots */ +	if (vp->nr_consts && !vp->data) { +		struct nouveau_resource *heap = nv20->screen->vp_data_heap; + +		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { +			while (heap->next && heap->size < vp->nr_consts) { +				struct nv20_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->data); +			} + +			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) +				assert(0); +		} + +		/*XXX: handle this some day */ +		assert(vp->data->start >= vp->data_start_min); + +		upload_data = TRUE; +		if (vp->data_start != vp->data->start) +			upload_code = TRUE; +	} + +	/* If exec or data segments moved we need to patch the program to +	 * fixup offsets and register IDs. 
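+	 * Branch targets are not relocated yet (see the has_branch_offset
+	 * assert below).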
+	 */ +	if (vp->exec_start != vp->exec->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->has_branch_offset) { +				assert(0); +			} +		} + +		vp->exec_start = vp->exec->start; +	} + +	if (vp->nr_consts && vp->data_start != vp->data->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->const_index >= 0) { +				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; +				vpi->data[1] |= +					(vpi->const_index + vp->data->start) << +					NV30_VP_INST_CONST_SRC_SHIFT; + +			} +		} + +		vp->data_start = vp->data->start; +	} + +	/* Update + Upload constant values */ +	if (vp->nr_consts) { +		float *map = NULL; + +		if (constbuf) { +			map = ws->buffer_map(ws, constbuf, +					     PIPE_BUFFER_USAGE_CPU_READ); +		} + +		for (i = 0; i < vp->nr_consts; i++) { +			struct nv20_vertex_program_data *vpd = &vp->consts[i]; + +			if (vpd->index >= 0) { +				if (!upload_data && +				    !memcmp(vpd->value, &map[vpd->index * 4], +					    4 * sizeof(float))) +					continue; +				memcpy(vpd->value, &map[vpd->index * 4], +				       4 * sizeof(float)); +			} + +			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); +			OUT_RING  (i + vp->data->start); +			OUT_RINGp ((uint32_t *)vpd->value, 4); +		} + +		if (constbuf) { +			ws->buffer_unmap(ws, constbuf); +		} +	} + +	/* Upload vtxprog */ +	if (upload_code) { +#if 0 +		for (i = 0; i < vp->nr_insns; i++) { +			NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", +				i, vp->insns[i].data[0], vp->insns[i].data[1], +				vp->insns[i].data[2], vp->insns[i].data[3]); +		} +#endif +		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); +		OUT_RING  (vp->exec->start); +		for (i = 0; i < vp->nr_insns; i++) { +			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); +			OUT_RINGp (vp->insns[i].data, 4); +		} +	} + +	if (vp->so != nv20->state.hw[NV30_STATE_VERTPROG]) { +		so_ref(vp->so, &nv20->state.hw[NV30_STATE_VERTPROG]); +		return TRUE; +	} + +	return FALSE; +} + +void +nv20_vertprog_destroy(struct nv20_context *nv20, struct nv20_vertex_program *vp) +{ +	struct nouveau_winsys *nvws = nv20->screen->nvws; + +	vp->translated = FALSE; + +	if (vp->nr_insns) { +		FREE(vp->insns); +		vp->insns = NULL; +		vp->nr_insns = 0; +	} + +	if (vp->nr_consts) { +		FREE(vp->consts); +		vp->consts = NULL; +		vp->nr_consts = 0; +	} + +	nvws->res_free(&vp->exec); +	vp->exec_start = 0; +	nvws->res_free(&vp->data); +	vp->data_start = 0; +	vp->data_start_min = 0; + +	vp->ir = vp->or = 0; +	so_ref(NULL, &vp->so); +} + +struct nv20_state_entry nv20_state_vertprog = { +	.validate = nv20_vertprog_validate, +	.dirty = { +		.pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, +		.hw = NV30_STATE_VERTPROG, +	} +}; diff --git a/src/gallium/drivers/nv30/Makefile b/src/gallium/drivers/nv30/Makefile new file mode 100644 index 0000000000..69f2790dfe --- /dev/null +++ b/src/gallium/drivers/nv30/Makefile @@ -0,0 +1,37 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = nv30 + +DRIVER_SOURCES = \ +	nv30_clear.c \ +	nv30_context.c \ +	nv30_draw.c \ +	nv30_fragprog.c \ +	nv30_fragtex.c \ +	nv30_miptree.c \ +	nv30_query.c \ +	nv30_screen.c \ +	nv30_state.c \ +	nv30_state_blend.c \ +	nv30_state_emit.c \ +	nv30_state_fb.c \ +	nv30_state_rasterizer.c \ +	nv30_state_scissor.c \ +	nv30_state_stipple.c \ +	nv30_state_viewport.c \ +	nv30_state_zsa.c \ +	nv30_surface.c \ +	nv30_vbo.c \ +	nv30_vertprog.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv30/nv30_clear.c b/src/gallium/drivers/nv30/nv30_clear.c new file mode 100644 index 0000000000..8c3ca204d5 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" + +void +nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +	ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c new file mode 100644 index 0000000000..61654f8756 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +static void +nv30_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	 +	if (flags & PIPE_FLUSH_TEXTURE_CACHE) { +		BEGIN_RING(rankine, 0x1fd8, 1); +		OUT_RING  (2); +		BEGIN_RING(rankine, 0x1fd8, 1); +		OUT_RING  (1); +	} + +	FIRE_RING(fence); +} + +static void +nv30_destroy(struct pipe_context *pipe) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	if (nv30->draw) +		draw_destroy(nv30->draw); +	FREE(nv30); +} + +struct pipe_context * +nv30_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct nv30_screen *screen = nv30_screen(pscreen); +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv30_context *nv30; +	struct nouveau_winsys *nvws = screen->nvws; + +	nv30 = CALLOC(1, sizeof(struct nv30_context)); +	if (!nv30) +		return NULL; +	nv30->screen = screen; +	nv30->pctx_id = pctx_id; + +	nv30->nvws = nvws; + +	nv30->pipe.winsys = ws; +	nv30->pipe.screen = pscreen; +	nv30->pipe.destroy = nv30_destroy; +	nv30->pipe.draw_arrays = nv30_draw_arrays; +	nv30->pipe.draw_elements = nv30_draw_elements; +	nv30->pipe.clear = nv30_clear; +	nv30->pipe.flush = nv30_flush; + +	nv30_init_query_functions(nv30); +	nv30_init_surface_functions(nv30); +	nv30_init_state_functions(nv30); + +	/* Create, configure, and install fallback swtnl path */ +	nv30->draw = draw_create(); +	draw_wide_point_threshold(nv30->draw, 9999999.0); +	draw_wide_line_threshold(nv30->draw, 9999999.0); +	draw_enable_line_stipple(nv30->draw, FALSE); +	draw_enable_point_sprites(nv30->draw, FALSE); +	draw_set_rasterize_stage(nv30->draw, nv30_draw_render_stage(nv30)); + +	return &nv30->pipe; +} +	 diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h new file mode 100644 index 0000000000..b933769700 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.h @@ -0,0 +1,212 @@ +#ifndef __NV30_CONTEXT_H__ +#define __NV30_CONTEXT_H__ + +#include "pipe/p_context.h" +#include 
"pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \ +	struct nv30_screen *ctx = nv30->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv30_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ +	fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv30_state_index { +	NV30_STATE_FB = 0, +	NV30_STATE_VIEWPORT = 1, +	NV30_STATE_BLEND = 2, +	NV30_STATE_RAST = 3, +	NV30_STATE_ZSA = 4, +	NV30_STATE_BCOL = 5, +	NV30_STATE_CLIP = 6, +	NV30_STATE_SCISSOR = 7, +	NV30_STATE_STIPPLE = 8, +	NV30_STATE_FRAGPROG = 9, +	NV30_STATE_VERTPROG = 10, +	NV30_STATE_FRAGTEX0 = 11, +	NV30_STATE_FRAGTEX1 = 12, +	NV30_STATE_FRAGTEX2 = 13, +	NV30_STATE_FRAGTEX3 = 14, +	NV30_STATE_FRAGTEX4 = 15, +	NV30_STATE_FRAGTEX5 = 16, +	NV30_STATE_FRAGTEX6 = 17, +	NV30_STATE_FRAGTEX7 = 18, +	NV30_STATE_FRAGTEX8 = 19, +	NV30_STATE_FRAGTEX9 = 20, +	NV30_STATE_FRAGTEX10 = 21, +	NV30_STATE_FRAGTEX11 = 22, +	NV30_STATE_FRAGTEX12 = 23, +	NV30_STATE_FRAGTEX13 = 24, +	NV30_STATE_FRAGTEX14 = 25, +	NV30_STATE_FRAGTEX15 = 26, +	NV30_STATE_VERTTEX0 = 27, +	NV30_STATE_VERTTEX1 = 28, +	NV30_STATE_VERTTEX2 = 29, +	NV30_STATE_VERTTEX3 = 30, +	NV30_STATE_VTXBUF = 31, +	NV30_STATE_VTXFMT = 32, +	NV30_STATE_VTXATTR = 33, +	NV30_STATE_MAX = 34 +}; + +#include "nv30_screen.h" + +#define NV30_NEW_BLEND		(1 <<  0) +#define NV30_NEW_RAST		(1 <<  1) +#define NV30_NEW_ZSA		(1 <<  2) +#define NV30_NEW_SAMPLER	(1 <<  3) +#define NV30_NEW_FB		(1 <<  4) +#define NV30_NEW_STIPPLE	(1 <<  5) +#define NV30_NEW_SCISSOR	(1 <<  6) +#define NV30_NEW_VIEWPORT	(1 <<  7) +#define NV30_NEW_BCOL		(1 <<  8) +#define NV30_NEW_VERTPROG	(1 <<  9) +#define NV30_NEW_FRAGPROG	(1 << 10) +#define NV30_NEW_ARRAYS		(1 << 11) +#define NV30_NEW_UCP		(1 << 12) + +struct nv30_rasterizer_state { +	struct pipe_rasterizer_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv30_zsa_state { +	struct pipe_depth_stencil_alpha_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv30_blend_state { +	struct pipe_blend_state pipe; +	struct nouveau_stateobj *so; +}; + + +struct nv30_state { +	unsigned scissor_enabled; +	unsigned stipple_enabled; +	unsigned viewport_bypass; +	unsigned fp_samplers; + +	uint64_t dirty; +	struct nouveau_stateobj *hw[NV30_STATE_MAX]; +}; + +struct nv30_context { +	struct pipe_context pipe; + +	struct nouveau_winsys *nvws; +	struct nv30_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	/* HW state derived from pipe states */ +	struct nv30_state state; + +	/* Context state */ +	unsigned dirty; +	struct pipe_scissor_state scissor; +	unsigned stipple[32]; +	struct nv30_vertex_program *vertprog; +	struct nv30_fragment_program *fragprog; +	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; +	unsigned constbuf_nr[PIPE_SHADER_TYPES]; +	struct nv30_rasterizer_state *rasterizer; +	struct nv30_zsa_state *zsa; +	struct nv30_blend_state *blend; +	struct pipe_blend_color blend_colour; +	struct pipe_viewport_state viewport; +	struct pipe_framebuffer_state framebuffer; +	struct pipe_buffer *idxbuf; +	unsigned idxbuf_format; +	struct nv30_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; +	struct nv30_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; +	unsigned 
nr_samplers; +	unsigned nr_textures; +	unsigned dirty_samplers; +	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; +	unsigned vtxbuf_nr; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +	unsigned vtxelt_nr; +	const unsigned *edgeflags; +}; + +static INLINE struct nv30_context * +nv30_context(struct pipe_context *pipe) +{ +	return (struct nv30_context *)pipe; +} + +struct nv30_state_entry { +	boolean (*validate)(struct nv30_context *nv30); +	struct { +		unsigned pipe; +		unsigned hw; +	} dirty; +}; + +extern void nv30_init_state_functions(struct nv30_context *nv30); +extern void nv30_init_surface_functions(struct nv30_context *nv30); +extern void nv30_init_query_functions(struct nv30_context *nv30); + +extern void nv30_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv30_draw.c */ +extern struct draw_stage *nv30_draw_render_stage(struct nv30_context *nv30); + +/* nv30_vertprog.c */ +extern void nv30_vertprog_destroy(struct nv30_context *, +				  struct nv30_vertex_program *); + +/* nv30_fragprog.c */ +extern void nv30_fragprog_destroy(struct nv30_context *, +				  struct nv30_fragment_program *); + +/* nv30_fragtex.c */ +extern void nv30_fragtex_bind(struct nv30_context *); + +/* nv30_state.c and friends */ +extern boolean nv30_state_validate(struct nv30_context *nv30); +extern void nv30_state_emit(struct nv30_context *nv30); +extern struct nv30_state_entry nv30_state_rasterizer; +extern struct nv30_state_entry nv30_state_scissor; +extern struct nv30_state_entry nv30_state_stipple; +extern struct nv30_state_entry nv30_state_fragprog; +extern struct nv30_state_entry nv30_state_vertprog; +extern struct nv30_state_entry nv30_state_blend; +extern struct nv30_state_entry nv30_state_blend_colour; +extern struct nv30_state_entry nv30_state_zsa; +extern struct nv30_state_entry nv30_state_viewport; +extern struct nv30_state_entry nv30_state_framebuffer; +extern struct nv30_state_entry nv30_state_fragtex; +extern struct nv30_state_entry nv30_state_vbo; + +/* nv30_vbo.c */ +extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv30_draw_elements(struct pipe_context *pipe, +				  struct pipe_buffer *indexBuffer, +				  unsigned indexSize, +				  unsigned mode, unsigned start, +				  unsigned count); + +/* nv30_clear.c */ +extern void nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv30/nv30_draw.c b/src/gallium/drivers/nv30/nv30_draw.c new file mode 100644 index 0000000000..74fc138c05 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_draw.c @@ -0,0 +1,61 @@ +#include "draw/draw_pipe.h" + +#include "nv30_context.h" + +struct nv30_draw_stage { +	struct draw_stage draw; +	struct nv30_context *nv30; +}; + +static void +nv30_draw_point(struct draw_stage *draw, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_line(struct draw_stage *draw, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_tri(struct draw_stage *draw, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_flush(struct draw_stage *draw, unsigned flags) +{ +} + +static void +nv30_draw_reset_stipple_counter(struct draw_stage *draw) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_destroy(struct draw_stage *draw) +{ +	FREE(draw); +} + +struct draw_stage * +nv30_draw_render_stage(struct nv30_context *nv30) +{ +	struct nv30_draw_stage *nv30draw = CALLOC_STRUCT(nv30_draw_stage); + 
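+	/* The point/line/tri callbacks installed here are stubs that only
+	 * print an error; hardware rendering of draw-module output is not
+	 * hooked up yet. */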
+	nv30draw->nv30 = nv30; +	nv30draw->draw.draw = nv30->draw; +	nv30draw->draw.point = nv30_draw_point; +	nv30draw->draw.line = nv30_draw_line; +	nv30draw->draw.tri = nv30_draw_tri; +	nv30draw->draw.flush = nv30_draw_flush; +	nv30draw->draw.reset_stipple_counter = nv30_draw_reset_stipple_counter; +	nv30draw->draw.destroy = nv30_draw_destroy; + +	return &nv30draw->draw; +} + diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c new file mode 100644 index 0000000000..320ba3f4bf --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_fragprog.c @@ -0,0 +1,911 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv30_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV30_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV30_FP_OP_COND_TR +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) +#define scale(s,v) nv30_sr_scale((s), NV30_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv30_fpc { +	struct nv30_fragment_program *fp; + +	uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + +	int high_temp; +	int temp_temp_count; +	int num_regs; + +	uint depth_id; +	uint colour_id; + +	unsigned inst_offset; + +	struct { +		int pipe; +		float vals[4]; +	} consts[MAX_CONSTS]; +	int nr_consts; + +	struct nv30_sreg imm[MAX_IMM]; +	unsigned nr_imm; +}; + +static INLINE struct nv30_sreg +temp(struct nv30_fpc *fpc) +{ +	int idx; + +	idx  = fpc->temp_temp_count++; +	idx += fpc->high_temp + 1; +	return nv30_sr(NV30SR_TEMP, idx); +} + +static INLINE struct nv30_sreg +constant(struct nv30_fpc *fpc, int pipe, float vals[4]) +{ +	int idx; + +	if (fpc->nr_consts == MAX_CONSTS) +		assert(0); +	idx = fpc->nr_consts++; + +	fpc->consts[idx].pipe = pipe; +	if (pipe == -1) +		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); +	return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ +	nv30_fp_arith((cc), (s), NV30_FP_OP_OPCODE_##o, \ +			(d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ +	nv30_fp_tex((cc), (s), NV30_FP_OP_OPCODE_##o, (u), \ +		    (d), (m), (s0), none, none) + +static void +grow_insns(struct nv30_fpc *fpc, int size) +{ +	struct nv30_fragment_program *fp = fpc->fp; + +	fp->insn_len += size; +	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src) +{ +	struct nv30_fragment_program *fp = fpc->fp; +	uint32_t *hw = &fp->insn[fpc->inst_offset]; +	uint32_t sr = 0; + +	switch (src.type) { +	case NV30SR_INPUT: +		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); +		hw[0] |= (src.index << NV30_FP_OP_INPUT_SRC_SHIFT); +		break; +	case NV30SR_OUTPUT: +		sr |= NV30_FP_REG_SRC_HALF; +		/* fall-through */ +	case NV30SR_TEMP: +		sr |= (NV30_FP_REG_TYPE_TEMP << NV30_FP_REG_TYPE_SHIFT); +		sr |= (src.index << NV30_FP_REG_SRC_SHIFT); +		break; +	case NV30SR_CONST: +		grow_insns(fpc, 4); +		hw = &fp->insn[fpc->inst_offset]; +		if (fpc->consts[src.index].pipe >= 0) { +			struct nv30_fragment_program_data *fpd; + +			fp->consts = realloc(fp->consts, ++fp->nr_consts * +					     sizeof(*fpd)); +			fpd = 
&fp->consts[fp->nr_consts - 1]; +			fpd->offset = fpc->inst_offset + 4; +			fpd->index = fpc->consts[src.index].pipe; +			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); +		} else { +			memcpy(&fp->insn[fpc->inst_offset + 4], +				fpc->consts[src.index].vals, +				sizeof(uint32_t) * 4); +		} + +		sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT);	 +		break; +	case NV30SR_NONE: +		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); +		break; +	default: +		assert(0); +	} + +	if (src.negate) +		sr |= NV30_FP_REG_NEGATE; + +	if (src.abs) +		hw[1] |= (1 << (29 + pos)); + +	sr |= ((src.swz[0] << NV30_FP_REG_SWZ_X_SHIFT) | +	       (src.swz[1] << NV30_FP_REG_SWZ_Y_SHIFT) | +	       (src.swz[2] << NV30_FP_REG_SWZ_Z_SHIFT) | +	       (src.swz[3] << NV30_FP_REG_SWZ_W_SHIFT)); + +	hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv30_fpc *fpc, struct nv30_sreg dst) +{ +	struct nv30_fragment_program *fp = fpc->fp; +	uint32_t *hw = &fp->insn[fpc->inst_offset]; + +	switch (dst.type) { +	case NV30SR_TEMP: +		if (fpc->num_regs < (dst.index + 1)) +			fpc->num_regs = dst.index + 1; +		break; +	case NV30SR_OUTPUT: +		if (dst.index == 1) { +			fp->fp_control |= 0xe; +		} else { +			hw[0] |= NV30_FP_OP_OUT_REG_HALF; +		} +		break; +	case NV30SR_NONE: +		hw[0] |= (1 << 30); +		break; +	default: +		assert(0); +	} + +	hw[0] |= (dst.index << NV30_FP_OP_OUT_REG_SHIFT); +} + +static void +nv30_fp_arith(struct nv30_fpc *fpc, int sat, int op, +	      struct nv30_sreg dst, int mask, +	      struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ +	struct nv30_fragment_program *fp = fpc->fp; +	uint32_t *hw; + +	fpc->inst_offset = fp->insn_len; +	grow_insns(fpc, 4); +	hw = &fp->insn[fpc->inst_offset]; +	memset(hw, 0, sizeof(uint32_t) * 4); + +	if (op == NV30_FP_OP_OPCODE_KIL) +		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL; +	hw[0] |= (op << NV30_FP_OP_OPCODE_SHIFT); +	hw[0] |= (mask << NV30_FP_OP_OUTMASK_SHIFT); +	hw[2] |= (dst.dst_scale << NV30_FP_OP_DST_SCALE_SHIFT); + +	if (sat) +		hw[0] |= NV30_FP_OP_OUT_SAT; + +	if (dst.cc_update) +		hw[0] |= NV30_FP_OP_COND_WRITE_ENABLE; +	hw[1] |= (dst.cc_test << NV30_FP_OP_COND_SHIFT); +	hw[1] |= ((dst.cc_swz[0] << NV30_FP_OP_COND_SWZ_X_SHIFT) | +		  (dst.cc_swz[1] << NV30_FP_OP_COND_SWZ_Y_SHIFT) | +		  (dst.cc_swz[2] << NV30_FP_OP_COND_SWZ_Z_SHIFT) | +		  (dst.cc_swz[3] << NV30_FP_OP_COND_SWZ_W_SHIFT)); + +	emit_dst(fpc, dst); +	emit_src(fpc, 0, s0); +	emit_src(fpc, 1, s1); +	emit_src(fpc, 2, s2); +} + +static void +nv30_fp_tex(struct nv30_fpc *fpc, int sat, int op, int unit, +	    struct nv30_sreg dst, int mask, +	    struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ +	struct nv30_fragment_program *fp = fpc->fp; + +	nv30_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + +	fp->insn[fpc->inst_offset] |= (unit << NV30_FP_OP_TEX_UNIT_SHIFT); +	fp->samplers |= (1 << unit); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ +	struct nv30_sreg src; + +	switch (fsrc->SrcRegister.File) { +	case TGSI_FILE_INPUT: +		src = nv30_sr(NV30SR_INPUT, +			      fpc->attrib_map[fsrc->SrcRegister.Index]); +		break; +	case TGSI_FILE_CONSTANT: +		src = constant(fpc, fsrc->SrcRegister.Index, NULL); +		break; +	case TGSI_FILE_IMMEDIATE: +		assert(fsrc->SrcRegister.Index < fpc->nr_imm); +		src = fpc->imm[fsrc->SrcRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1); +		if (fpc->high_temp < src.index) +			fpc->high_temp = 
src.index; +		break; +	/* This is clearly insane, but gallium hands us shaders like this. +	 * Luckily fragprog results are just temp regs.. +	 */ +	case TGSI_FILE_OUTPUT: +		if (fsrc->SrcRegister.Index == fpc->colour_id) +			return nv30_sr(NV30SR_OUTPUT, 0); +		else +			return nv30_sr(NV30SR_OUTPUT, 1); +		break; +	default: +		NOUVEAU_ERR("bad src file\n"); +		break; +	} + +	src.abs = fsrc->SrcRegisterExtMod.Absolute; +	src.negate = fsrc->SrcRegister.Negate; +	src.swz[0] = fsrc->SrcRegister.SwizzleX; +	src.swz[1] = fsrc->SrcRegister.SwizzleY; +	src.swz[2] = fsrc->SrcRegister.SwizzleZ; +	src.swz[3] = fsrc->SrcRegister.SwizzleW; +	return src; +} + +static INLINE struct nv30_sreg +tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) { +	int idx; + +	switch (fdst->DstRegister.File) { +	case TGSI_FILE_OUTPUT: +		if (fdst->DstRegister.Index == fpc->colour_id) +			return nv30_sr(NV30SR_OUTPUT, 0); +		else +			return nv30_sr(NV30SR_OUTPUT, 1); +		break; +	case TGSI_FILE_TEMPORARY: +		idx = fdst->DstRegister.Index + 1; +		if (fpc->high_temp < idx) +			fpc->high_temp = idx; +		return nv30_sr(NV30SR_TEMP, idx); +	case TGSI_FILE_NULL: +		return nv30_sr(NV30SR_NONE, 0); +	default: +		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); +		return nv30_sr(NV30SR_NONE, 0); +	} +} + +static INLINE int +tgsi_mask(uint tgsi) +{ +	int mask = 0; + +	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; +	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; +	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; +	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; +	return mask; +} + +static boolean +src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc, +	       struct nv30_sreg *src) +{ +	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); +	struct nv30_sreg tgsi = tgsi_src(fpc, fsrc); +	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; +	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, +			fsrc->SrcRegisterExtSwz.NegateY, +			fsrc->SrcRegisterExtSwz.NegateZ, +			fsrc->SrcRegisterExtSwz.NegateW }; +	uint c; + +	for (c = 0; c < 4; c++) { +		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { +		case TGSI_EXTSWIZZLE_X: +		case TGSI_EXTSWIZZLE_Y: +		case TGSI_EXTSWIZZLE_Z: +		case TGSI_EXTSWIZZLE_W: +			mask |= (1 << c); +			break; +		case TGSI_EXTSWIZZLE_ZERO: +			zero_mask |= (1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		case TGSI_EXTSWIZZLE_ONE: +			one_mask |= (1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		default: +			assert(0); +		} + +		if (!tgsi.negate && neg[c]) +			neg_mask |= (1 << c); +	} + +	if (mask == MASK_ALL && !neg_mask) +		return TRUE; + +	*src = temp(fpc); + +	if (mask) +		arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + +	if (zero_mask) +		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + +	if (one_mask) +		arith(fpc, 0, STR, *src, one_mask, *src, none, none); + +	if (neg_mask) { +		struct nv30_sreg one = temp(fpc); +		arith(fpc, 0, STR, one, neg_mask, one, none, none); +		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); +	} + +	return FALSE; +} + +static boolean +nv30_fragprog_parse_instruction(struct nv30_fpc *fpc, +				const struct tgsi_full_instruction *finst) +{ +	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); +	struct nv30_sreg src[3], dst, tmp; +	int mask, sat, unit = 0; +	int ai = -1, ci = -1; +	int i; + +	if (finst->Instruction.Opcode == TGSI_OPCODE_END) +		return TRUE; + +	fpc->temp_temp_count = 0; +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc 
= &finst->FullSrcRegisters[i]; +		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { +			src[i] = tgsi_src(fpc, fsrc); +		} +	} + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_TEMPORARY: +			if (!src_native_swz(fpc, fsrc, &src[i])) +				continue; +			break; +		default: +			break; +		} + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			if (ai == -1 || ai == fsrc->SrcRegister.Index) { +				ai = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(fpc, fsrc); +			} else { +				NOUVEAU_MSG("extra src attr %d\n", +					 fsrc->SrcRegister.Index); +				src[i] = temp(fpc); +				arith(fpc, 0, MOV, src[i], MASK_ALL, +				      tgsi_src(fpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_IMMEDIATE: +			if (ci == -1 || ci == fsrc->SrcRegister.Index) { +				ci = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(fpc, fsrc); +			} else { +				src[i] = temp(fpc); +				arith(fpc, 0, MOV, src[i], MASK_ALL, +				      tgsi_src(fpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_TEMPORARY: +			/* handled above */ +			break; +		case TGSI_FILE_SAMPLER: +			unit = fsrc->SrcRegister.Index; +			break; +		case TGSI_FILE_OUTPUT: +			break; +		default: +			NOUVEAU_ERR("bad src file\n"); +			return FALSE; +		} +	} + +	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]); +	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); +	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + +	switch (finst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); +		break; +	case TGSI_OPCODE_ADD: +		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_CMP: +		tmp = temp(fpc); +		arith(fpc, sat, MOV, dst, mask, src[2], none, none); +		tmp.cc_update = 1; +		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); +		dst.cc_test = NV30_VP_INST_COND_LT; +		arith(fpc, sat, MOV, dst, mask, src[1], none, none); +		break; +	case TGSI_OPCODE_COS: +		arith(fpc, sat, COS, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_DP3: +		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DP4: +		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DPH: +		tmp = temp(fpc); +		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); +		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), +		      swz(src[1], W, W, W, W), none); +		break; +	case TGSI_OPCODE_DST: +		arith(fpc, sat, DST, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_EX2: +		arith(fpc, sat, EX2, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FLR: +		arith(fpc, sat, FLR, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FRC: +		arith(fpc, sat, FRC, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_KILP: +		arith(fpc, 0, KIL, none, 0, none, none, none); +		break; +	case TGSI_OPCODE_KIL: +		dst = nv30_sr(NV30SR_NONE, 0); +		dst.cc_update = 1; +		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); +		dst.cc_update = 0; dst.cc_test = NV30_FP_OP_COND_LT; +		arith(fpc, 0, KIL, dst, 0, none, none, none); +		break; +	case TGSI_OPCODE_LG2: +		arith(fpc, sat, LG2, dst, mask, src[0], none, none); +		break; +//	case TGSI_OPCODE_LIT: +	case TGSI_OPCODE_LRP: +		arith(fpc, sat, LRP, dst, mask, src[0], src[1], src[2]); +		
break; +	case TGSI_OPCODE_MAD: +		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); +		break; +	case TGSI_OPCODE_MAX: +		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MIN: +		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MOV: +		arith(fpc, sat, MOV, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_MUL: +		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_NOISE1: +	case TGSI_OPCODE_NOISE2: +	case TGSI_OPCODE_NOISE3: +	case TGSI_OPCODE_NOISE4: +		arith(fpc, sat, SFL, dst, mask, none, none, none); +		break; +	case TGSI_OPCODE_POW: +		arith(fpc, sat, POW, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_RCP: +		arith(fpc, sat, RCP, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_RET: +		assert(0); +		break; +	case TGSI_OPCODE_RFL: +		arith(fpc, 0, RFL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_RSQ: +		arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none); +		break; +	case TGSI_OPCODE_SCS: +		if (mask & MASK_X) { +			arith(fpc, sat, COS, dst, MASK_X, +			      swz(src[0], X, X, X, X), none, none); +		} +		if (mask & MASK_Y) { +			arith(fpc, sat, SIN, dst, MASK_Y, +			      swz(src[0], X, X, X, X), none, none); +		} +		break; +	case TGSI_OPCODE_SIN: +		arith(fpc, sat, SIN, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_SGE: +		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SGT: +		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SLT: +		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SUB: +		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); +		break; +	case TGSI_OPCODE_TEX: +		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_TXB: +		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_TXP: +		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_XPD: +		tmp = temp(fpc); +		arith(fpc, 0, MUL, tmp, mask, +		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); +		arith(fpc, sat, MAD, dst, (mask & ~MASK_W), +		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), +		      neg(tmp)); +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); +		return FALSE; +	} + +	return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc, +				const struct tgsi_full_declaration *fdec) +{ +	int hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = NV30_FP_OP_INPUT_SRC_POSITION; +		break; +	case TGSI_SEMANTIC_COLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV30_FP_OP_INPUT_SRC_COL0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV30_FP_OP_INPUT_SRC_COL1; +		} else { +			NOUVEAU_ERR("bad colour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_FOG: +		hw = NV30_FP_OP_INPUT_SRC_FOGC; +		break; +	case TGSI_SEMANTIC_GENERIC: +		if (fdec->Semantic.SemanticIndex <= 7) { +			hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic. 
+						     SemanticIndex); +		} else { +			NOUVEAU_ERR("bad generic semantic index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad input semantic\n"); +		return FALSE; +	} + +	fpc->attrib_map[fdec->DeclarationRange.First] = hw; +	return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc, +				const struct tgsi_full_declaration *fdec) +{ +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		fpc->depth_id = fdec->DeclarationRange.First; +		break; +	case TGSI_SEMANTIC_COLOR: +		fpc->colour_id = fdec->DeclarationRange.First; +		break; +	default: +		NOUVEAU_ERR("bad output semantic\n"); +		return FALSE; +	} + +	return TRUE; +} + +static boolean +nv30_fragprog_prepare(struct nv30_fpc *fpc) +{ +	struct tgsi_parse_context p; +	/*int high_temp = -1, i;*/ + +	tgsi_parse_init(&p, fpc->fp->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch(tok->Token.Type) { +		case TGSI_TOKEN_TYPE_DECLARATION: +		{ +			const struct tgsi_full_declaration *fdec; +			fdec = &p.FullToken.FullDeclaration; +			switch (fdec->Declaration.File) { +			case TGSI_FILE_INPUT: +				if (!nv30_fragprog_parse_decl_attrib(fpc, fdec)) +					goto out_err; +				break; +			case TGSI_FILE_OUTPUT: +				if (!nv30_fragprog_parse_decl_output(fpc, fdec)) +					goto out_err; +				break; +			/*case TGSI_FILE_TEMPORARY: +				if (fdec->DeclarationRange.Last > high_temp) { +					high_temp = +						fdec->DeclarationRange.Last; +				} +				break;*/ +			default: +				break; +			} +		} +			break; +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			struct tgsi_full_immediate *imm; +			float vals[4]; +			 +			imm = &p.FullToken.FullImmediate; +			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +			assert(fpc->nr_imm < MAX_IMM); + +			vals[0] = imm->u.ImmediateFloat32[0].Float; +			vals[1] = imm->u.ImmediateFloat32[1].Float; +			vals[2] = imm->u.ImmediateFloat32[2].Float; +			vals[3] = imm->u.ImmediateFloat32[3].Float; +			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); +		} +			break; +		default: +			break; +		} +	} +	tgsi_parse_free(&p); + +	/*if (++high_temp) { +		fpc->r_temp = CALLOC(high_temp, sizeof(struct nv30_sreg)); +		for (i = 0; i < high_temp; i++) +			fpc->r_temp[i] = temp(fpc); +		fpc->r_temps_discard = 0; +	}*/ + +	return TRUE; + +out_err: +	/*if (fpc->r_temp) +		FREE(fpc->r_temp);*/ +	tgsi_parse_free(&p); +	return FALSE; +} + +static void +nv30_fragprog_translate(struct nv30_context *nv30, +			struct nv30_fragment_program *fp) +{ +	struct tgsi_parse_context parse; +	struct nv30_fpc *fpc = NULL; + +	tgsi_dump(fp->pipe.tokens,0); + +	fpc = CALLOC(1, sizeof(struct nv30_fpc)); +	if (!fpc) +		return; +	fpc->fp = fp; +	fpc->high_temp = -1; +	fpc->num_regs = 2; + +	if (!nv30_fragprog_prepare(fpc)) { +		FREE(fpc); +		return; +	} + +	tgsi_parse_init(&parse, fp->pipe.tokens); + +	while (!tgsi_parse_end_of_tokens(&parse)) { +		tgsi_parse_token(&parse); + +		switch (parse.FullToken.Token.Type) { +		case TGSI_TOKEN_TYPE_INSTRUCTION: +		{ +			const struct tgsi_full_instruction *finst; + +			finst = &parse.FullToken.FullInstruction; +			if (!nv30_fragprog_parse_instruction(fpc, finst)) +				goto out_err; +		} +			break; +		default: +			break; +		} +	} + +	fp->fp_control |= (fpc->num_regs-1)/2; +	fp->fp_reg_control = (1<<16)|0x4; + +	/* Terminate final instruction */ +	fp->insn[fpc->inst_offset] |= 0x00000001; + +	/* Append NOP + END instruction, may or may not be necessary. 
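+	 * (The 0x00000001 ORed in above and written below appears to be the
+	 *  NV30_FP_OP_PROGRAM_END bit from nv30_shader.h; in the appended
+	 *  instruction the remaining three words are left zero, which gives a
+	 *  NOP opcode with the end bit set.)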
*/ +	fpc->inst_offset = fp->insn_len; +	grow_insns(fpc, 4); +	fp->insn[fpc->inst_offset + 0] = 0x00000001; +	fp->insn[fpc->inst_offset + 1] = 0x00000000; +	fp->insn[fpc->inst_offset + 2] = 0x00000000; +	fp->insn[fpc->inst_offset + 3] = 0x00000000; +	 +	fp->translated = TRUE; +	fp->on_hw = FALSE; +out_err: +	tgsi_parse_free(&parse); +	FREE(fpc); +} + +static void +nv30_fragprog_upload(struct nv30_context *nv30, +		     struct nv30_fragment_program *fp) +{ +	struct pipe_winsys *ws = nv30->pipe.winsys; +	const uint32_t le = 1; +	uint32_t *map; +	int i; + +	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 +	for (i = 0; i < fp->insn_len; i++) { +		fflush(stdout); fflush(stderr); +		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); +		fflush(stdout); fflush(stderr); +	} +#endif + +	if ((*(const uint8_t *)&le)) { +		for (i = 0; i < fp->insn_len; i++) { +			map[i] = fp->insn[i]; +		} +	} else { +		/* Weird swapping for big-endian chips */ +		for (i = 0; i < fp->insn_len; i++) { +			map[i] = ((fp->insn[i] & 0xffff) << 16) | +				  ((fp->insn[i] >> 16) & 0xffff); +		} +	} + +	ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv30_fragprog_validate(struct nv30_context *nv30) +{ +	struct nv30_fragment_program *fp = nv30->fragprog; +	struct pipe_buffer *constbuf = +		nv30->constbuf[PIPE_SHADER_FRAGMENT]; +	struct pipe_winsys *ws = nv30->pipe.winsys; +	struct nouveau_stateobj *so; +	boolean new_consts = FALSE; +	int i; + +	if (fp->translated) +		goto update_constants; + +	/*nv30->fallback_swrast &= ~NV30_NEW_FRAGPROG;*/ +	nv30_fragprog_translate(nv30, fp); +	if (!fp->translated) { +		/*nv30->fallback_swrast |= NV30_NEW_FRAGPROG;*/ +		return FALSE; +	} + +	fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); +	nv30_fragprog_upload(nv30, fp); + +	so = so_new(8, 1); +	so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1); +	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, +		  NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1); +	so_method(so, nv30->screen->rankine, NV34TCL_FP_CONTROL, 1); +	so_data  (so, fp->fp_control); +	so_method(so, nv30->screen->rankine, NV34TCL_FP_REG_CONTROL, 1); +	so_data  (so, fp->fp_reg_control); +	so_method(so, nv30->screen->rankine, NV34TCL_TX_UNITS_ENABLE, 1); +	so_data  (so, fp->samplers); +	so_ref(so, &fp->so); + +update_constants: +	if (fp->nr_consts) { +		float *map; +		 +		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); +		for (i = 0; i < fp->nr_consts; i++) { +			struct nv30_fragment_program_data *fpd = &fp->consts[i]; +			uint32_t *p = &fp->insn[fpd->offset]; +			uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + +			if (!memcmp(p, cb, 4 * sizeof(float))) +				continue; +			memcpy(p, cb, 4 * sizeof(float)); +			new_consts = TRUE; +		} +		ws->buffer_unmap(ws, constbuf); + +		if (new_consts) +			nv30_fragprog_upload(nv30, fp); +	} + +	if (new_consts || fp->so != nv30->state.hw[NV30_STATE_FRAGPROG]) { +		so_ref(fp->so, &nv30->state.hw[NV30_STATE_FRAGPROG]); +		return TRUE; +	} + +	return FALSE; +} + +void +nv30_fragprog_destroy(struct nv30_context *nv30, +		      struct nv30_fragment_program *fp) +{ +	if (fp->insn_len) +		FREE(fp->insn); +} + +struct nv30_state_entry nv30_state_fragprog = { +	.validate = nv30_fragprog_validate, +	.dirty = { +		.pipe = NV30_NEW_FRAGPROG, +		.hw = NV30_STATE_FRAGPROG +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c new file mode 100644 index 
0000000000..b1d2663af3 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_fragtex.c @@ -0,0 +1,163 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \ +{                                                                              \ +  TRUE,                                                                        \ +  PIPE_FORMAT_##m,                                                             \ +  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \ +  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \ +   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \ +   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \ +   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \ +} + +struct nv30_texture_format { +	boolean defined; +	uint	pipe; +	int     format; +	int     swizzle; +}; + +static struct nv30_texture_format +nv30_texture_formats[] = { +	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W), +	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W), +	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W), +	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W), +	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X), +	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X), +	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X), +	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y), +//	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X), +//	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X), +	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W), +	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W), +	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W), +	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W), +	{}, +}; + +static struct nv30_texture_format * +nv30_fragtex_format(uint pipe_format) +{ +	struct nv30_texture_format *tf = nv30_texture_formats; +	char fs[128]; + +	while (tf->defined) { +		if (tf->pipe == pipe_format) +			return tf; +		tf++; +	} + +	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); +	return NULL; +} + + +static struct nouveau_stateobj * +nv30_fragtex_build(struct nv30_context *nv30, int unit) +{ +	struct nv30_sampler_state *ps = nv30->tex_sampler[unit]; +	struct nv30_miptree *nv30mt = nv30->tex_miptree[unit]; +	struct pipe_texture *pt = &nv30mt->base; +	struct nv30_texture_format *tf; +	struct nouveau_stateobj *so; +	uint32_t txf, txs , txp; +	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + +	tf = nv30_fragtex_format(pt->format); +	if (!tf) +		assert(0); + +	txf  = tf->format; +	txf |= ((pt->last_level>0) ? 
NV34TCL_TX_FORMAT_MIPMAP : 0); +	txf |= log2i(pt->width[0]) << 20; +	txf |= log2i(pt->height[0]) << 24; +	txf |= log2i(pt->depth[0]) << 28; +	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000; + +	switch (pt->target) { +	case PIPE_TEXTURE_CUBE: +		txf |= NV34TCL_TX_FORMAT_CUBIC; +		/* fall-through */ +	case PIPE_TEXTURE_2D: +		txf |= NV34TCL_TX_FORMAT_DIMS_2D; +		break; +	case PIPE_TEXTURE_3D: +		txf |= NV34TCL_TX_FORMAT_DIMS_3D; +		break; +	case PIPE_TEXTURE_1D: +		txf |= NV34TCL_TX_FORMAT_DIMS_1D; +		break; +	default: +		NOUVEAU_ERR("Unknown target %d\n", pt->target); +		return NULL; +	} + +	if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		txp = 0; +	} else { +		txp  = nv30mt->level[0].pitch; +		txf |= (1<<13) /*FIXME: NV34TCL_TX_FORMAT_LINEAR ? */; +	} + +	txs = tf->swizzle; + +	so = so_new(16, 2); +	so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8); +	so_reloc (so, nv30mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); +	so_reloc (so, nv30mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, +		  NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); +	so_data  (so, ps->wrap); +	so_data  (so, NV34TCL_TX_ENABLE_ENABLE | ps->en); +	so_data  (so, txs); +	so_data  (so, ps->filt | 0x2000 /*voodoo*/); +	so_data  (so, (pt->width[0] << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | +		       pt->height[0]); +	so_data  (so, ps->bcol); + +	return so; +} + +static boolean +nv30_fragtex_validate(struct nv30_context *nv30) +{ +	struct nv30_fragment_program *fp = nv30->fragprog; +	struct nv30_state *state = &nv30->state; +	struct nouveau_stateobj *so; +	unsigned samplers, unit; + +	samplers = state->fp_samplers & ~fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		so = so_new(2, 0); +		so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1); +		so_data  (so, 0); +		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); +		state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); +	} + +	samplers = nv30->dirty_samplers & fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		so = nv30_fragtex_build(nv30, unit); +		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); +		state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); +	} + +	nv30->state.fp_samplers = fp->samplers; +	return FALSE; +} + +struct nv30_state_entry nv30_state_fragtex = { +	.validate = nv30_fragtex_validate, +	.dirty = { +		.pipe = NV30_NEW_SAMPLER | NV30_NEW_FRAGPROG, +		.hw = 0 +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c new file mode 100644 index 0000000000..fe13f50ebb --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_miptree.c @@ -0,0 +1,234 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv30_context.h" + +static void +nv30_miptree_layout(struct nv30_miptree *nv30mt) +{ +	struct pipe_texture *pt = &nv30mt->base; +	uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; +	uint offset = 0; +	int nr_faces, l, f; +	uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER | +		                           PIPE_TEXTURE_USAGE_DEPTH_STENCIL | +		                           PIPE_TEXTURE_USAGE_RENDER_TARGET | +		                           PIPE_TEXTURE_USAGE_DISPLAY_TARGET | +		                           PIPE_TEXTURE_USAGE_PRIMARY); + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		nr_faces = 6; +	} else +	if (pt->target == PIPE_TEXTURE_3D) { +		nr_faces = pt->depth[0]; +	} else { +		nr_faces = 1; +	} + +	for (l = 0; l <= 
pt->last_level; l++) { +		pt->width[l] = width; +		pt->height[l] = height; +		pt->depth[l] = depth; +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + +		if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) +			nv30mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64); +		else +			nv30mt->level[l].pitch = pt->width[l] * pt->block.size; + +		nv30mt->level[l].image_offset = +			CALLOC(nr_faces, sizeof(unsigned)); + +		width  = MAX2(1, width  >> 1); +		height = MAX2(1, height >> 1); +		depth  = MAX2(1, depth  >> 1); +	} + +	for (f = 0; f < nr_faces; f++) { +		for (l = 0; l < pt->last_level; l++) { +			nv30mt->level[l].image_offset[f] = offset; + +			if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) +				offset += align(nv30mt->level[l].pitch * pt->height[l], 64); +			else +				offset += nv30mt->level[l].pitch * pt->height[l]; +		} + +		nv30mt->level[l].image_offset[f] = offset; +		offset += nv30mt->level[l].pitch * pt->height[l]; +	} + +	nv30mt->total_size = offset; +} + +static struct pipe_texture * +nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv30_miptree *mt; + +	mt = MALLOC(sizeof(struct nv30_miptree)); +	if (!mt) +		return NULL; +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->shadow_tex = NULL; +	mt->shadow_surface = NULL; + +	/* Swizzled textures must be POT */ +	if (pt->width[0] & (pt->width[0] - 1) || +	    pt->height[0] & (pt->height[0] - 1)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | ++	                     PIPE_TEXTURE_USAGE_DISPLAY_TARGET | ++	                     PIPE_TEXTURE_USAGE_DEPTH_STENCIL)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else { +		switch (pt->format) { +		/* TODO: Figure out which formats can be swizzled */ +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_X8R8G8B8_UNORM: +		case PIPE_FORMAT_R16_SNORM: +		{ +			if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE)) +				mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + 			break; +		} +		default: +			mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +		} +	} + +	nv30_miptree_layout(mt); + +	mt->buffer = ws->buffer_create(ws, 256, +				       PIPE_BUFFER_USAGE_PIXEL | +				       NOUVEAU_BUFFER_USAGE_TEXTURE, +				       mt->total_size); +	if (!mt->buffer) { +		FREE(mt); +		return NULL; +	} + +	return &mt->base; +} + +static struct pipe_texture * +nv30_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv30_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv30_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->level[0].pitch = stride[0]; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static void +nv30_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ +	struct pipe_texture *pt = *ppt; +	struct nv30_miptree *mt = (struct nv30_miptree *)pt; +	int l; + +	*ppt = NULL; +	if 
(--pt->refcount) +		return; + +	pipe_buffer_reference(pscreen, &mt->buffer, NULL); +	for (l = 0; l <= pt->last_level; l++) { +		if (mt->level[l].image_offset) +			FREE(mt->level[l].image_offset); +	} + +	if (mt->shadow_tex) { +		if (mt->shadow_surface) +			pscreen->tex_surface_release(pscreen, &mt->shadow_surface); +		nv30_miptree_release(pscreen, &mt->shadow_tex); +	} + +	FREE(mt); +} + +static struct pipe_surface * +nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct nv30_miptree *nv30mt = (struct nv30_miptree *)pt; +	struct pipe_surface *ps; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = nv30mt->level[level].pitch; +	ps->usage = flags; +	ps->status = PIPE_SURFACE_STATUS_DEFINED; +	ps->refcount = 1; +	ps->face = face; +	ps->level = level; +	ps->zslice = zslice; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		ps->offset = nv30mt->level[level].image_offset[face]; +	} else +	if (pt->target == PIPE_TEXTURE_3D) { +		ps->offset = nv30mt->level[level].image_offset[zslice]; +	} else { +		ps->offset = nv30mt->level[level].image_offset[0]; +	} + +	return ps; +} + +static void +nv30_miptree_surface_del(struct pipe_screen *pscreen, +			 struct pipe_surface **psurface) +{ +	struct pipe_surface *ps = *psurface; + +	*psurface = NULL; +	if (--ps->refcount > 0) +		return; + +	pipe_texture_reference(&ps->texture, NULL); +	FREE(ps); +} + +void +nv30_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv30_miptree_create; +	pscreen->texture_blanket = nv30_miptree_blanket; +	pscreen->texture_release = nv30_miptree_release; +	pscreen->get_tex_surface = nv30_miptree_surface_new; +	pscreen->tex_surface_release = nv30_miptree_surface_del; +} diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c new file mode 100644 index 0000000000..2f974cf5c4 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" + +#include "nv30_context.h" + +struct nv30_query { +	struct nouveau_resource *object; +	unsigned type; +	boolean ready; +	uint64_t result; +}; + +static INLINE struct nv30_query * +nv30_query(struct pipe_query *pipe) +{ +	return (struct nv30_query *)pipe; +} + +static struct pipe_query * +nv30_query_create(struct pipe_context *pipe, unsigned query_type) +{ +	struct nv30_query *q; + +	q = CALLOC(1, sizeof(struct nv30_query)); +	q->type = query_type; + +	return (struct pipe_query *)q; +} + +static void +nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_query *q = nv30_query(pq); + +	if (q->object) +		nv30->nvws->res_free(&q->object); +	FREE(q); +} + +static void +nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_query *q = nv30_query(pq); + +	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + +	/* Happens when end_query() is called, then another begin_query() +	 * without querying the result in-between.  For now we'll wait for +	 * the existing query to notify completion, but it could be better. 
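+	 * For reference, the state-tracker usage this guards against looks
+	 * roughly like the following (illustrative only, not code from this
+	 * driver):
+	 *
+	 *    pipe->begin_query(pipe, q);
+	 *    ... draw ...
+	 *    pipe->end_query(pipe, q);
+	 *    pipe->begin_query(pipe, q);   <- result of the first pass was never
+	 *                                     fetched, so we block on its
+	 *                                     notifier here before reusing q.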
+	 */ +	if (q->object) { +		uint64_t tmp; +		pipe->get_query_result(pipe, pq, 1, &tmp); +	} + +	if (nv30->nvws->res_alloc(nv30->screen->query_heap, 1, NULL, &q->object)) +		assert(0); +	nv30->nvws->notifier_reset(nv30->screen->query, q->object->start); + +	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1); +	OUT_RING  (1); +	BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1); +	OUT_RING  (1); + +	q->ready = FALSE; +} + +static void +nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_query *q = nv30_query(pq); + +	BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1); +	OUT_RING  ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) | +		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT)); +	FIRE_RING(NULL); +} + +static boolean +nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, +		  boolean wait, uint64_t *result) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_query *q = nv30_query(pq); +	struct nouveau_winsys *nvws = nv30->nvws; + +	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + +	if (!q->ready) { +		unsigned status; + +		status = nvws->notifier_status(nv30->screen->query, +					       q->object->start); +		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { +			if (wait == FALSE) +				return FALSE; +			nvws->notifier_wait(nv30->screen->query, q->object->start, +					    NV_NOTIFY_STATE_STATUS_COMPLETED, +					    0); +		} + +		q->result = nvws->notifier_retval(nv30->screen->query, +						  q->object->start); +		q->ready = TRUE; +		nvws->res_free(&q->object); +	} + +	*result = q->result; +	return TRUE; +} + +void +nv30_init_query_functions(struct nv30_context *nv30) +{ +	nv30->pipe.create_query = nv30_query_create; +	nv30->pipe.destroy_query = nv30_query_destroy; +	nv30->pipe.begin_query = nv30_query_begin; +	nv30->pipe.end_query = nv30_query_end; +	nv30->pipe.get_query_result = nv30_query_result; +} diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c new file mode 100644 index 0000000000..c97a73f0b1 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.c @@ -0,0 +1,401 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +#define NV30TCL_CHIPSET_3X_MASK 0x00000003 +#define NV34TCL_CHIPSET_3X_MASK 0x00000010 +#define NV35TCL_CHIPSET_3X_MASK 0x000001e0 + +static const char * +nv30_screen_get_name(struct pipe_screen *pscreen) +{ +	struct nv30_screen *screen = nv30_screen(pscreen); +	struct nouveau_device *dev = screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv30_screen_get_vendor(struct pipe_screen *pscreen) +{ +	return "nouveau"; +} + +static int +nv30_screen_get_param(struct pipe_screen *pscreen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 16; +	case PIPE_CAP_NPOT_TEXTURES: +		return 0; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 1; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 0; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 1; +	case PIPE_CAP_POINT_SPRITE: +		return 1; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 2; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 1; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 1; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 13; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 10; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 13; +	case 
PIPE_CAP_TEXTURE_MIRROR_CLAMP: +		return 0; +	case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +		return 1; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; +	case NOUVEAU_CAP_HW_VTXBUF: +	case NOUVEAU_CAP_HW_IDXBUF: +		return 1; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv30_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 10.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 64.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 8.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 4.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static boolean +nv30_screen_surface_format_supported(struct pipe_screen *pscreen, +				     enum pipe_format format, +				     enum pipe_texture_target target, +				     unsigned tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM: +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_A4R4G4B4_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM: +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +		case PIPE_FORMAT_I8_UNORM: +		case PIPE_FORMAT_A8L8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +		case PIPE_FORMAT_Z24S8_UNORM: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static struct pipe_buffer * +nv30_surface_buffer(struct pipe_surface *surf) +{ +	struct nv30_miptree *mt = (struct nv30_miptree *)surf->texture; + +	return mt->buffer; +} + +static void * +nv30_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, +		 unsigned flags ) +{ +	struct pipe_winsys	*ws = screen->winsys; +	struct pipe_surface	*surface_to_map; +	void			*map; + +	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + +		if (!mt->shadow_tex) { +			unsigned old_tex_usage = surface->texture->tex_usage; +			surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR | +			                              PIPE_TEXTURE_USAGE_DYNAMIC; +			mt->shadow_tex = screen->texture_create(screen, surface->texture); +			surface->texture->tex_usage = old_tex_usage; + +			assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); +		} + +		mt->shadow_surface = screen->get_tex_surface +		( +			screen, mt->shadow_tex, +			surface->face, surface->level, surface->zslice, +			surface->usage +		); + +		surface_to_map = mt->shadow_surface; +	} +	else +		surface_to_map = surface; + +	assert(surface_to_map); + +	map = ws->buffer_map(ws, nv30_surface_buffer(surface_to_map), flags); +	if (!map) +		return NULL; + +	return map + surface_to_map->offset; +} + +static void +nv30_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ +	struct pipe_winsys	*ws = screen->winsys; +	struct pipe_surface	*surface_to_unmap; + +	/* TODO: Copy from shadow just before push buffer is flushed instead. +	         There are probably some programs that map/unmap excessively +	         before rendering. 
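+	         As written, mapping a swizzled surface returns a pointer into
+	         a linear shadow texture, and this unmap path then has the
+	         eng2d object blit the shadow's contents back into the
+	         swizzled surface before the shadow surface is released.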
*/ +	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + +		assert(mt->shadow_tex); + +		surface_to_unmap = mt->shadow_surface; +	} +	else +		surface_to_unmap = surface; + +	assert(surface_to_unmap); + +	ws->buffer_unmap(ws, nv30_surface_buffer(surface_to_unmap)); + +	if (surface_to_unmap != surface) { +		struct nv30_screen *nvscreen = nv30_screen(screen); + +		nvscreen->eng2d->copy(nvscreen->eng2d, surface, 0, 0, +		                      surface_to_unmap, 0, 0, +		                      surface->width, surface->height); + +		screen->tex_surface_release(screen, &surface_to_unmap); +	} +} + +static void +nv30_screen_destroy(struct pipe_screen *pscreen) +{ +	struct nv30_screen *screen = nv30_screen(pscreen); +	struct nouveau_winsys *nvws = screen->nvws; + +	nvws->res_free(&screen->vp_exec_heap); +	nvws->res_free(&screen->vp_data_heap); +	nvws->res_free(&screen->query_heap); +	nvws->notifier_free(&screen->query); +	nvws->notifier_free(&screen->sync); +	nvws->grobj_free(&screen->rankine); + +	FREE(pscreen); +} + +struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen); +	struct nouveau_stateobj *so; +	unsigned rankine_class = 0; +	unsigned chipset = nvws->channel->device->chipset; +	int ret, i; + +	if (!screen) +		return NULL; +	screen->nvws = nvws; + +	/* 2D engine setup */ +	screen->eng2d = nv04_surface_2d_init(nvws); +	screen->eng2d->buf = nv30_surface_buffer; + +	/* 3D object */ +	switch (chipset & 0xf0) { +	case 0x30: +		if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) +			rankine_class = 0x0397; +		else +		if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) +			rankine_class = 0x0697; +		else +		if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) +			rankine_class = 0x0497; +		break; +	default: +		break; +	} + +	if (!rankine_class) { +		NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", chipset); +		return NULL; +	} + +	ret = nvws->grobj_alloc(nvws, rankine_class, &screen->rankine); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		return FALSE; +	} + +	/* Notifier for sync purposes */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv30_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Query objects */ +	ret = nvws->notifier_alloc(nvws, 32, &screen->query); +	if (ret) { +		NOUVEAU_ERR("Error initialising query objects: %d\n", ret); +		nv30_screen_destroy(&screen->pipe); +		return NULL; +	} + +	ret = nvws->res_init(&screen->query_heap, 0, 32); +	if (ret) { +		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); +		nv30_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Vtxprog resources */ +	if (nvws->res_init(&screen->vp_exec_heap, 0, 256) || +	    nvws->res_init(&screen->vp_data_heap, 0, 256)) { +		nv30_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Static rankine initialisation */ +	so = so_new(128, 0); +	so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1); +	so_data  (so, screen->sync->handle); +	so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->gart->handle); +	so_method(so, screen->rankine, NV34TCL_DMA_COLOR1, 1); +	so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->rankine, NV34TCL_DMA_COLOR0, 2); +	so_data  (so, nvws->channel->vram->handle); +	
so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->rankine, NV34TCL_DMA_VTXBUF0, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->gart->handle); +/*	so_method(so, screen->rankine, NV34TCL_DMA_FENCE, 2); +	so_data  (so, 0); +	so_data  (so, screen->query->handle);*/ +	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY7, 1); +	so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY8, 1); +	so_data  (so, nvws->channel->vram->handle); + +	for (i=1; i<8; i++) { +		so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1); +		so_data  (so, 0); +		so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_VERT(i), 1); +		so_data  (so, 0); +	} + +	so_method(so, screen->rankine, 0x220, 1); +	so_data  (so, 1); + +	so_method(so, screen->rankine, 0x03b0, 1); +	so_data  (so, 0x00100000); +	so_method(so, screen->rankine, 0x1454, 1); +	so_data  (so, 0); +	so_method(so, screen->rankine, 0x1d80, 1); +	so_data  (so, 3); +	so_method(so, screen->rankine, 0x1450, 1); +	so_data  (so, 0x00030004); + +	/* NEW */ +	so_method(so, screen->rankine, 0x1e98, 1); +	so_data  (so, 0); +	so_method(so, screen->rankine, 0x17e0, 3); +	so_data  (so, fui(0.0)); +	so_data  (so, fui(0.0)); +	so_data  (so, fui(1.0)); +	so_method(so, screen->rankine, 0x1f80, 16); +	for (i=0; i<16; i++) { +		so_data  (so, (i==8) ? 0x0000ffff : 0); +	} + +	so_method(so, screen->rankine, 0x120, 3); +	so_data  (so, 0); +	so_data  (so, 1); +	so_data  (so, 2); + +	so_method(so, screen->rankine, 0x1d88, 1); +	so_data  (so, 0x00001200); + +	so_method(so, screen->rankine, NV34TCL_RC_ENABLE, 1); +	so_data  (so, 0); + +	so_method(so, screen->rankine, NV34TCL_DEPTH_RANGE_NEAR, 2); +	so_data  (so, fui(0.0)); +	so_data  (so, fui(1.0)); + +	so_method(so, screen->rankine, NV34TCL_MULTISAMPLE_CONTROL, 1); +	so_data  (so, 0xffff0000); + +	/* enables use of vp rather than fixed-function somehow */ +	so_method(so, screen->rankine, 0x1e94, 1); +	so_data  (so, 0x13); + +	so_emit(nvws, so); +	so_ref(NULL, &so); +	nvws->push_flush(nvws, 0, NULL); + +	screen->pipe.winsys = ws; +	screen->pipe.destroy = nv30_screen_destroy; + +	screen->pipe.get_name = nv30_screen_get_name; +	screen->pipe.get_vendor = nv30_screen_get_vendor; +	screen->pipe.get_param = nv30_screen_get_param; +	screen->pipe.get_paramf = nv30_screen_get_paramf; + +	screen->pipe.is_format_supported = nv30_screen_surface_format_supported; + +	screen->pipe.surface_map = nv30_surface_map; +	screen->pipe.surface_unmap = nv30_surface_unmap; + +	nv30_screen_init_miptree_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} diff --git a/src/gallium/drivers/nv30/nv30_screen.h b/src/gallium/drivers/nv30/nv30_screen.h new file mode 100644 index 0000000000..b11e470f94 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.h @@ -0,0 +1,37 @@ +#ifndef __NV30_SCREEN_H__ +#define __NV30_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv30_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; + +	unsigned cur_pctx; + +	/* HW graphics objects */ +	struct nv04_surface_2d *eng2d; +	struct nouveau_grobj *rankine; +	struct nouveau_notifier *sync; + +	/* Query object resources */ +	struct nouveau_notifier *query; +	struct nouveau_resource *query_heap; + +	/* Vtxprog resources */ +	struct nouveau_resource *vp_exec_heap; +	struct nouveau_resource *vp_data_heap; + +	/* Current 3D state of channel */ +	struct nouveau_stateobj *state[NV30_STATE_MAX]; 
+}; + +static INLINE struct nv30_screen * +nv30_screen(struct pipe_screen *screen) +{ +	return (struct nv30_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv30/nv30_shader.h b/src/gallium/drivers/nv30/nv30_shader.h new file mode 100644 index 0000000000..dd3a36f78f --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_shader.h @@ -0,0 +1,490 @@ +#ifndef __NV30_SHADER_H__ +#define __NV30_SHADER_H__ + +/* Vertex programs instruction set + * + * 128bit opcodes, split into 4 32-bit ones for ease of use. + * + * Non-native instructions + *   ABS - MOV + NV40_VP_INST0_DEST_ABS + *   POW - EX2 + MUL + LG2 + *   SUB - ADD, second source negated + *   SWZ - MOV + *   XPD -   + * + * Register access + *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs) + *   - Only one CONST can be accessed per-instruction (move extras into TEMPs) + * + * Relative Addressing + *   According to the value returned for + *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB + * + *   there are only two address registers available.  The destination in the + *   ARL instruction is set to TEMP <n> (The temp isn't actually written). + * + *   When using vanilla ARB_v_p, the proprietary driver will squish both the + *   available ADDRESS regs into the first hardware reg in the X and Y + *   components. + * + *   To use an address reg as an index into consts, the CONST_SRC is set to + *   (const_base + offset) and INDEX_CONST is set. + * + *   To access the second address reg use ADDR_REG_SELECT_1. A particular + *   component of the address regs is selected with ADDR_SWZ. + * + *   Only one address register can be accessed per instruction. + * + * Conditional execution (see NV_vertex_program{2,3} for details) Conditional + * execution of an instruction is enabled by setting COND_TEST_ENABLE, and + * selecting the condition which will allow the test to pass with + * COND_{FL,LT,...}.  It is possible to swizzle the values in the condition + * register, which allows for testing against an individual component. + * + * Branching: + * + *   The BRA/CAL instructions seem to follow a slightly different opcode + *   layout.  The destination instruction ID (IADDR) overlaps a source field. + *   Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO + *   command, and is incremented automatically on each UPLOAD_INST FIFO + *   command. + * + *   Conditional branching is achieved by using the condition tests described + *   above.  There doesn't appear to be dedicated looping instructions, but + *   this can be done using a temp reg + conditional branching. + * + *   Subroutines may be uploaded before the main program itself, but the first + *   executed instruction is determined by the PROGRAM_START_ID FIFO command. 
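+ * Source field packing
+ *
+ *   A full 15-bit source selector (type/index, swizzle and negate, as laid
+ *   out in the NV30_VP_SRC_* defines below) does not fit inside a single
+ *   instruction field, so it is split across two instruction words.  Based
+ *   purely on the shift/mask defines below (not verified against hardware),
+ *   packing source 0 presumably looks something like:
+ *
+ *     hw[1] |= ((src0 & NV30_VP_SRC0_HIGH_MASK) >> NV30_VP_SRC0_HIGH_SHIFT)
+ *              << NV30_VP_INST_SRC0H_SHIFT;
+ *     hw[2] |= (src0 & NV30_VP_SRC0_LOW_MASK) << NV30_VP_INST_SRC0L_SHIFT;
+ *
+ *   Source 1 fits into dword 2 in one piece (NV30_VP_INST_SRC1_MASK), while
+ *   source 2 is split across dwords 2 and 3 via the SRC2H/SRC2L fields.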
+ * + */ + +/* DWORD 0 */ + +#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24) +#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */ +#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */ +#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */ +#define NV30_VP_INST_VEC_RESULT         (1 << 20) +#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16 +#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16) +#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15) +#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16) +#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14) +#define NV30_VP_INST_COND_SHIFT          11 +#define NV30_VP_INST_COND_MASK          (0x07 << 11) +#  define NV30_VP_INST_COND_FL  0 /* guess */   +#  define NV30_VP_INST_COND_LT  1   +#  define NV30_VP_INST_COND_EQ  2 +#  define NV30_VP_INST_COND_LE  3 +#  define NV30_VP_INST_COND_GT  4 +#  define NV30_VP_INST_COND_NE  5 +#  define NV30_VP_INST_COND_GE  6 +#  define NV30_VP_INST_COND_TR  7 /* guess */ +#define NV30_VP_INST_COND_SWZ_X_SHIFT        9 +#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9) +#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7 +#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7) +#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5 +#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5) +#define NV30_VP_INST_COND_SWZ_W_SHIFT        3 +#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3) +#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3 +#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3) +#define NV30_VP_INST_ADDR_SWZ_SHIFT        1 +#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1) +#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0 +#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0) + +/* DWORD 1 */ +#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28 +#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28) +#  define NV30_VP_INST_OP_NOP  0x00 +#  define NV30_VP_INST_OP_RCP  0x02 +#  define NV30_VP_INST_OP_RCC  0x03 +#  define NV30_VP_INST_OP_RSQ  0x04 +#  define NV30_VP_INST_OP_EXP  0x05 +#  define NV30_VP_INST_OP_LOG  0x06 +#  define NV30_VP_INST_OP_LIT  0x07 +#  define NV30_VP_INST_OP_BRA  0x09 +#  define NV30_VP_INST_OP_CAL  0x0B +#  define NV30_VP_INST_OP_RET  0x0C +#  define NV30_VP_INST_OP_LG2  0x0D +#  define NV30_VP_INST_OP_EX2  0x0E +#  define NV30_VP_INST_OP_SIN  0x0F +#  define NV30_VP_INST_OP_COS  0x10 +#define NV30_VP_INST_VEC_OPCODE_SHIFT        23 +#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23) +#  define NV30_VP_INST_OP_NOPV  0x00 +#  define NV30_VP_INST_OP_MOV  0x01 +#  define NV30_VP_INST_OP_MUL  0x02 +#  define NV30_VP_INST_OP_ADD  0x03 +#  define NV30_VP_INST_OP_MAD  0x04 +#  define NV30_VP_INST_OP_DP3  0x05 +#  define NV30_VP_INST_OP_DP4  0x07 +#  define NV30_VP_INST_OP_DPH  0x06 +#  define NV30_VP_INST_OP_DST  0x08 +#  define NV30_VP_INST_OP_MIN  0x09 +#  define NV30_VP_INST_OP_MAX  0x0A +#  define NV30_VP_INST_OP_SLT  0x0B +#  define NV30_VP_INST_OP_SGE  0x0C +#  define NV30_VP_INST_OP_ARL  0x0D +#  define NV30_VP_INST_OP_FRC  0x0E +#  define NV30_VP_INST_OP_FLR  0x0F +#  define NV30_VP_INST_OP_SEQ  0x10 +#  define NV30_VP_INST_OP_SFL  0x11 +#  define NV30_VP_INST_OP_SGT  0x12 +#  define NV30_VP_INST_OP_SLE  0x13 +#  define NV30_VP_INST_OP_SNE  0x14 +#  define NV30_VP_INST_OP_STR  0x15 +#  define NV30_VP_INST_OP_SSG  0x16 +#  define NV30_VP_INST_OP_ARR  0x17 +#  define NV30_VP_INST_OP_ARA  0x18 +#define NV30_VP_INST_CONST_SRC_SHIFT        14 +#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14) +#define 
NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/ +#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/ +#  define NV30_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */ +#  define NV30_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */ +#  define NV30_VP_INST_IN_NORMAL  2     +#  define NV30_VP_INST_IN_COL0  3    /* Should probably confirm them all though */ +#  define NV30_VP_INST_IN_COL1  4 +#  define NV30_VP_INST_IN_FOGC  5 +#  define NV30_VP_INST_IN_TC0  8 +#  define NV30_VP_INST_IN_TC(n)  (8+n) +#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/ +#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/ + +/* Please note: the IADDR fields overlap other fields because they are used + * only for branch instructions.  See Branching: label above + * + * DWORD 2 + */ +#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/ +#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */ +#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/ +#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/ +#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/ +#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/ +#define NV30_VP_INST_IADDR_SHIFT        2 +#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */ + +/* DWORD 3 */ +#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/ +#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/ +#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24 +#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24) +#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20 +#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20) +#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16 +#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16) +#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/ +#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/ +#define NV30_VP_INST_DEST_SHIFT        2 +#define NV30_VP_INST_DEST_MASK        (0x0F <<  2) +#  define NV30_VP_INST_DEST_POS  0 +#  define NV30_VP_INST_DEST_BFC0  1 +#  define NV30_VP_INST_DEST_BFC1  2 +#  define NV30_VP_INST_DEST_COL0  3 +#  define NV30_VP_INST_DEST_COL1  4 +#  define NV30_VP_INST_DEST_FOGC  5 +#  define NV30_VP_INST_DEST_PSZ   6 +#  define NV30_VP_INST_DEST_TC(n)  (8+n) + +#define NV30_VP_INST_LAST                           (1 << 0) + +/* Useful to split the source selection regs into their pieces */ +#define NV30_VP_SRC0_HIGH_SHIFT                                                6 +#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0 +#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F +#define NV30_VP_SRC2_HIGH_SHIFT                                                4 +#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0 +#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F + + +/* Source-register definition - matches NV20 exactly */ +#define NV30_VP_SRC_NEGATE          (1<<14) +#define NV30_VP_SRC_SWZ_X_SHIFT        12 +#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12) +#define NV30_VP_SRC_SWZ_Y_SHIFT        10 +#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10) +#define NV30_VP_SRC_SWZ_Z_SHIFT        8 +#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8) +#define NV30_VP_SRC_SWZ_W_SHIFT        6 +#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6) +#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6 
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6) +#define NV30_VP_SRC_TEMP_SRC_SHIFT        2 +#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0) +#define NV30_VP_SRC_REG_TYPE_SHIFT        0 +#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0) +#define NV30_VP_SRC_REG_TYPE_TEMP  1 +#define NV30_VP_SRC_REG_TYPE_INPUT  2 +#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */ + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + *   0 - Opcode, output reg/mask, ATTRIB source + *   1 - Source 0 + *   2 - Source 1 + *   3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + *     result.color == R0.xyzw + *     result.depth == R1.z + * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0 + * otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. + *  + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords.  As such instructions such as: + *  + *     ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO + * is implemented simply by not writing to the relevant components of the destination. + * + * Conditional execution + *   TODO + *  + * Non-native instructions: + *   LIT + *   LRP - MAD+MAD + *   SUB - ADD, negate second source + *   RSQ - LG2 + EX2 + *   POW - LG2 + MUL + EX2 + *   SCS - COS + SIN + *   XPD + */ + +//== Opcode / Destination selection == +#define NV30_FP_OP_PROGRAM_END          (1 << 0) +#define NV30_FP_OP_OUT_REG_SHIFT        1 +#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */ +/* Needs to be set when writing outputs to get expected result.. */ +#define NV30_FP_OP_OUT_REG_HALF          (1 << 7) +#define NV30_FP_OP_COND_WRITE_ENABLE        (1 << 8) +#define NV30_FP_OP_OUTMASK_SHIFT        9 +#define NV30_FP_OP_OUTMASK_MASK          (0xF << 9) +#  define NV30_FP_OP_OUT_X  (1<<9) +#  define NV30_FP_OP_OUT_Y  (1<<10) +#  define NV30_FP_OP_OUT_Z  (1<<11) +#  define NV30_FP_OP_OUT_W  (1<<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. 
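 *
 * If the input really is selected once per instruction here (rather than in
 * the source selection dwords), that would also explain the one-attribute-
 * per-instruction limit noted earlier; the quoted
 *
 *     ADD result.color, fragment.color, fragment.texcoord[0];
 *
 * would then presumably be expanded to something like (temporary register
 * numbers are made up):
 *
 *     MOV R2, fragment.color;
 *     MOV R3, fragment.texcoord[0];
 *     ADD result.color, R2, R3;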
+ */ +#define NV30_FP_OP_INPUT_SRC_SHIFT        13 +#define NV30_FP_OP_INPUT_SRC_MASK        (15 << 13) +#  define NV30_FP_OP_INPUT_SRC_POSITION  0x0 +#  define NV30_FP_OP_INPUT_SRC_COL0  0x1 +#  define NV30_FP_OP_INPUT_SRC_COL1  0x2 +#  define NV30_FP_OP_INPUT_SRC_FOGC  0x3 +#  define NV30_FP_OP_INPUT_SRC_TC0    0x4 +#  define NV30_FP_OP_INPUT_SRC_TC(n)  (0x4 + n) +#define NV30_FP_OP_TEX_UNIT_SHIFT        17 +#define NV30_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */ +#define NV30_FP_OP_PRECISION_SHIFT        22 +#define NV30_FP_OP_PRECISION_MASK        (3 << 22) +#   define NV30_FP_PRECISION_FP32  0 +#   define NV30_FP_PRECISION_FP16  1 +#   define NV30_FP_PRECISION_FX12  2 +#define NV30_FP_OP_OPCODE_SHIFT          24 +#define NV30_FP_OP_OPCODE_MASK          (0x3F << 24) +#  define NV30_FP_OP_OPCODE_NOP  0x00 +#  define NV30_FP_OP_OPCODE_MOV  0x01 +#  define NV30_FP_OP_OPCODE_MUL  0x02 +#  define NV30_FP_OP_OPCODE_ADD  0x03 +#  define NV30_FP_OP_OPCODE_MAD  0x04 +#  define NV30_FP_OP_OPCODE_DP3  0x05 +#  define NV30_FP_OP_OPCODE_DP4  0x06 +#  define NV30_FP_OP_OPCODE_DST  0x07 +#  define NV30_FP_OP_OPCODE_MIN  0x08 +#  define NV30_FP_OP_OPCODE_MAX  0x09 +#  define NV30_FP_OP_OPCODE_SLT  0x0A +#  define NV30_FP_OP_OPCODE_SGE  0x0B +#  define NV30_FP_OP_OPCODE_SLE  0x0C +#  define NV30_FP_OP_OPCODE_SGT  0x0D +#  define NV30_FP_OP_OPCODE_SNE  0x0E +#  define NV30_FP_OP_OPCODE_SEQ  0x0F +#  define NV30_FP_OP_OPCODE_FRC  0x10 +#  define NV30_FP_OP_OPCODE_FLR  0x11 +#  define NV30_FP_OP_OPCODE_KIL  0x12 +#  define NV30_FP_OP_OPCODE_PK4B   0x13 +#  define NV30_FP_OP_OPCODE_UP4B   0x14 +#  define NV30_FP_OP_OPCODE_DDX  0x15 /* can only write XY */ +#  define NV30_FP_OP_OPCODE_DDY  0x16 /* can only write XY */ +#  define NV30_FP_OP_OPCODE_TEX  0x17 +#  define NV30_FP_OP_OPCODE_TXP  0x18 +#  define NV30_FP_OP_OPCODE_TXD  0x19 +#  define NV30_FP_OP_OPCODE_RCP  0x1A +#  define NV30_FP_OP_OPCODE_RSQ  0x1B +#  define NV30_FP_OP_OPCODE_EX2  0x1C +#  define NV30_FP_OP_OPCODE_LG2  0x1D +#  define NV30_FP_OP_OPCODE_LIT  0x1E +#  define NV30_FP_OP_OPCODE_LRP  0x1F +#  define NV30_FP_OP_OPCODE_STR  0x20  +#  define NV30_FP_OP_OPCODE_SFL  0x21 +#  define NV30_FP_OP_OPCODE_COS  0x22 +#  define NV30_FP_OP_OPCODE_SIN  0x23 +#  define NV30_FP_OP_OPCODE_PK2H   0x24 +#  define NV30_FP_OP_OPCODE_UP2H   0x25 +#  define NV30_FP_OP_OPCODE_POW  0x26 +#  define NV30_FP_OP_OPCODE_PK4UB  0x27 +#  define NV30_FP_OP_OPCODE_UP4UB  0x28 +#  define NV30_FP_OP_OPCODE_PK2US  0x29 +#  define NV30_FP_OP_OPCODE_UP2US  0x2A +#  define NV30_FP_OP_OPCODE_DP2A   0x2E +#  define NV30_FP_OP_OPCODE_TXB  0x31 +#  define NV30_FP_OP_OPCODE_RFL  0x36 +#  define NV30_FP_OP_OPCODE_DIV  0x3A +#define NV30_FP_OP_OUT_SAT          (1 << 31) + +/* high order bits of SRC0 */ +#define NV30_FP_OP_OUT_ABS          (1 << 29) +#define NV30_FP_OP_COND_SWZ_W_SHIFT        27 +#define NV30_FP_OP_COND_SWZ_W_MASK        (3 << 27) +#define NV30_FP_OP_COND_SWZ_Z_SHIFT        25 +#define NV30_FP_OP_COND_SWZ_Z_MASK        (3 << 25) +#define NV30_FP_OP_COND_SWZ_Y_SHIFT        23 +#define NV30_FP_OP_COND_SWZ_Y_MASK        (3 << 23) +#define NV30_FP_OP_COND_SWZ_X_SHIFT        21 +#define NV30_FP_OP_COND_SWZ_X_MASK        (3 << 21) +#define NV30_FP_OP_COND_SWZ_ALL_SHIFT        21 +#define NV30_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21) +#define NV30_FP_OP_COND_SHIFT          18 +#define NV30_FP_OP_COND_MASK          (0x07 << 18) +#  define NV30_FP_OP_COND_FL  0 +#  define NV30_FP_OP_COND_LT  1 +#  define NV30_FP_OP_COND_EQ  2 +#  define NV30_FP_OP_COND_LE  3 +#  
define NV30_FP_OP_COND_GT  4 +#  define NV30_FP_OP_COND_NE  5 +#  define NV30_FP_OP_COND_GE  6 +#  define NV30_FP_OP_COND_TR  7 + +/* high order bits of SRC1 */ +#define NV30_FP_OP_DST_SCALE_SHIFT        28 +#define NV30_FP_OP_DST_SCALE_MASK        (3 << 28) +#define NV30_FP_OP_DST_SCALE_1X                                                0 +#define NV30_FP_OP_DST_SCALE_2X                                                1 +#define NV30_FP_OP_DST_SCALE_4X                                                2 +#define NV30_FP_OP_DST_SCALE_8X                                                3 +#define NV30_FP_OP_DST_SCALE_INV_2X                                            5 +#define NV30_FP_OP_DST_SCALE_INV_4X                                            6 +#define NV30_FP_OP_DST_SCALE_INV_8X                                            7 + + +/* high order bits of SRC2 */ +#define NV30_FP_OP_INDEX_INPUT          (1 << 30) + +//== Register selection == +#define NV30_FP_REG_TYPE_SHIFT          0 +#define NV30_FP_REG_TYPE_MASK          (3 << 0) +#  define NV30_FP_REG_TYPE_TEMP  0 +#  define NV30_FP_REG_TYPE_INPUT  1 +#  define NV30_FP_REG_TYPE_CONST  2 +#define NV30_FP_REG_SRC_SHIFT          2 /* uncertain */ +#define NV30_FP_REG_SRC_MASK          (31 << 2) +#define NV30_FP_REG_SRC_HALF          (1 << 8) +#define NV30_FP_REG_SWZ_ALL_SHIFT        9 +#define NV30_FP_REG_SWZ_ALL_MASK        (255 << 9) +#define NV30_FP_REG_SWZ_X_SHIFT          9 +#define NV30_FP_REG_SWZ_X_MASK          (3 << 9) +#define NV30_FP_REG_SWZ_Y_SHIFT          11 +#define NV30_FP_REG_SWZ_Y_MASK          (3 << 11) +#define NV30_FP_REG_SWZ_Z_SHIFT          13 +#define NV30_FP_REG_SWZ_Z_MASK          (3 << 13) +#define NV30_FP_REG_SWZ_W_SHIFT          15 +#define NV30_FP_REG_SWZ_W_MASK          (3 << 15) +#  define NV30_FP_SWIZZLE_X  0 +#  define NV30_FP_SWIZZLE_Y  1 +#  define NV30_FP_SWIZZLE_Z  2 +#  define NV30_FP_SWIZZLE_W  3 +#define NV30_FP_REG_NEGATE          (1 << 17) + +#define NV30SR_NONE	0 +#define NV30SR_OUTPUT	1 +#define NV30SR_INPUT	2 +#define NV30SR_TEMP	3 +#define NV30SR_CONST	4 + +struct nv30_sreg { +	int type; +	int index; + +	int dst_scale; + +	int negate; +	int abs; +	int swz[4]; + +	int cc_update; +	int cc_update_reg; +	int cc_test; +	int cc_test_reg; +	int cc_swz[4]; +}; + +static INLINE struct nv30_sreg +nv30_sr(int type, int index) +{ +	struct nv30_sreg temp = { +		.type = type, +		.index = index, +		.dst_scale = DEF_SCALE, +		.abs = 0, +		.negate = 0, +		.swz = { 0, 1, 2, 3 }, +		.cc_update = 0, +		.cc_update_reg = 0, +		.cc_test = DEF_CTEST, +		.cc_test_reg = 0, +		.cc_swz = { 0, 1, 2, 3 }, +	}; +	return temp; +} + +static INLINE struct nv30_sreg +nv30_sr_swz(struct nv30_sreg src, int x, int y, int z, int w) +{ +	struct nv30_sreg dst = src; + +	dst.swz[SWZ_X] = src.swz[x]; +	dst.swz[SWZ_Y] = src.swz[y]; +	dst.swz[SWZ_Z] = src.swz[z]; +	dst.swz[SWZ_W] = src.swz[w]; +	return dst; +} + +static INLINE struct nv30_sreg +nv30_sr_neg(struct nv30_sreg src) +{ +	src.negate = !src.negate; +	return src; +} + +static INLINE struct nv30_sreg +nv30_sr_abs(struct nv30_sreg src) +{ +	src.abs = 1; +	return src; +} + +static INLINE struct nv30_sreg +nv30_sr_scale(struct nv30_sreg src, int scale) +{ +	src.dst_scale = scale; +	return src; +} + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c new file mode 100644 index 0000000000..26147565a5 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.c @@ -0,0 +1,725 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include 
"pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +static void * +nv30_blend_state_create(struct pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nouveau_grobj *rankine = nv30->screen->rankine; +	struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso)); +	struct nouveau_stateobj *so = so_new(16, 0); + +	if (cso->blend_enable) { +		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3); +		so_data  (so, 1); +		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | +			       nvgl_blend_func(cso->rgb_src_factor)); +		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | +			      nvgl_blend_func(cso->rgb_dst_factor)); +		/* FIXME: Gallium assumes GL_EXT_blend_func_separate. +		   It is not the case for NV30 */ +		so_method(so, rankine, NV34TCL_BLEND_EQUATION, 1); +		so_data  (so, nvgl_blend_eqn(cso->rgb_func)); +	} else { +		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_method(so, rankine, NV34TCL_COLOR_MASK, 1); +	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | +		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | +		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) | +		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0))); + +	if (cso->logicop_enable) { +		so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2); +		so_data  (so, 1); +		so_data  (so, nvgl_logicop_func(cso->logicop_func)); +	} else { +		so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_method(so, rankine, NV34TCL_DITHER_ENABLE, 1); +	so_data  (so, cso->dither ? 1 : 0); + +	so_ref(so, &bso->so); +	bso->pipe = *cso; +	return (void *)bso; +} + +static void +nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->blend = hwcso; +	nv30->dirty |= NV30_NEW_BLEND; +} + +static void +nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_blend_state *bso = hwcso; + +	so_ref(NULL, &bso->so); +	FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { +	unsigned ret; + +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		ret = NV34TCL_TX_WRAP_S_REPEAT; +		break; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_CLAMP: +		ret = NV34TCL_TX_WRAP_S_CLAMP; +		break; +/*	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP; +		break;*/ +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		ret = NV34TCL_TX_WRAP_S_REPEAT; +		break; +	} + +	return ret >> NV34TCL_TX_WRAP_S_SHIFT; +} + +static void * +nv30_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ +	struct nv30_sampler_state *ps; +	uint32_t filter = 0; + +	ps = MALLOC(sizeof(struct nv30_sampler_state)); + +	ps->fmt = 0; +	/* TODO: Not all RECTs formats have this bit set, bits 15-8 of format +	   are the tx format to use. 
We should store normalized coord flag +	   in sampler state structure, and set appropriate format in +	   nvxx_fragtex_build() +	 */ +	/*NV34TCL_TX_FORMAT_RECT*/ +	/*if (!cso->normalized_coords) { +		ps->fmt |= (1<<14) ; +	}*/ + +	ps->wrap = ((wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) | +		    (wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) | +		    (wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT)); + +	ps->en = 0; + +	if (cso->max_anisotropy >= 8.0) { +		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X; +	} else +	if (cso->max_anisotropy >= 4.0) { +		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X; +	} else +	if (cso->max_anisotropy >= 2.0) { +		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X; +	} + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; +			break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR; +			break; +		} +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; +		break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST; +			break; +		} +		break; +	} + +	ps->filt = filter; + +	{ +		float limit; + +		limit = CLAMP(cso->lod_bias, -16.0, 15.0); +		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + +		limit = CLAMP(cso->max_lod, 0.0, 15.0); +		ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/; + +		limit = CLAMP(cso->min_lod, 0.0, 15.0); +		ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/; +	} + +	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +		switch (cso->compare_func) { +		case PIPE_FUNC_NEVER: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NEVER; +			break; +		case PIPE_FUNC_GREATER: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GREATER; +			break; +		case PIPE_FUNC_EQUAL: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_EQUAL; +			break; +		case PIPE_FUNC_GEQUAL: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GEQUAL; +			break; +		case PIPE_FUNC_LESS: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LESS; +			break; +		case PIPE_FUNC_NOTEQUAL: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NOTEQUAL; +			break; +		case PIPE_FUNC_LEQUAL: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LEQUAL; +			break; +		case PIPE_FUNC_ALWAYS: +			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_ALWAYS; +			break; +		default: +			break; +		} +	} + +	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | +		    (float_to_ubyte(cso->border_color[0]) << 16) | +		    (float_to_ubyte(cso->border_color[1]) <<  8) | +		    (float_to_ubyte(cso->border_color[2]) <<  0)); + +	return (void *)ps; +} + +static void +nv30_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv30->tex_sampler[unit] = sampler[unit]; +		nv30->dirty_samplers |= (1 << unit); +	} + +	for (unit = nr; unit < nv30->nr_samplers; unit++) { +		
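		/* Unbind any samplers above the new count that were left over
		 * from a previous, larger bind, and flag those units dirty so
		 * the change is picked up on the next validation. */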
nv30->tex_sampler[unit] = NULL; +		nv30->dirty_samplers |= (1 << unit); +	} + +	nv30->nr_samplers = nr; +	nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void +nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void +nv30_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **miptree) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		pipe_texture_reference((struct pipe_texture **) +				       &nv30->tex_miptree[unit], miptree[unit]); +		nv30->dirty_samplers |= (1 << unit); +	} + +	for (unit = nr; unit < nv30->nr_textures; unit++) { +		pipe_texture_reference((struct pipe_texture **) +				       &nv30->tex_miptree[unit], NULL); +		nv30->dirty_samplers |= (1 << unit); +	} + +	nv30->nr_textures = nr; +	nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void * +nv30_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); +	struct nouveau_stateobj *so = so_new(32, 0); +	struct nouveau_grobj *rankine = nv30->screen->rankine; + +	/*XXX: ignored: +	 * 	light_twoside +	 * 	point_smooth -nohw +	 * 	multisample +	 */ + +	so_method(so, rankine, NV34TCL_SHADE_MODEL, 1); +	so_data  (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT : +				       NV34TCL_SHADE_MODEL_SMOOTH); + +	so_method(so, rankine, NV34TCL_LINE_WIDTH, 2); +	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff); +	so_data  (so, cso->line_smooth ? 1 : 0); +	so_method(so, rankine, NV34TCL_LINE_STIPPLE_ENABLE, 2); +	so_data  (so, cso->line_stipple_enable ? 1 : 0); +	so_data  (so, (cso->line_stipple_pattern << 16) | +		       cso->line_stipple_factor); + +	so_method(so, rankine, NV34TCL_POINT_SIZE, 1); +	so_data  (so, fui(cso->point_size)); + +	so_method(so, rankine, NV34TCL_POLYGON_MODE_FRONT, 6); +	if (cso->front_winding == PIPE_WINDING_CCW) { +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV34TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV34TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV34TCL_CULL_FACE_BACK); +			break; +		} +		so_data(so, NV34TCL_FRONT_FACE_CCW); +	} else { +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV34TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV34TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV34TCL_CULL_FACE_BACK); +			break; +		} +		so_data(so, NV34TCL_FRONT_FACE_CW); +	} +	so_data(so, cso->poly_smooth ? 1 : 0); +	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + +	so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); +	so_data  (so, cso->poly_stipple_enable ? 
1 : 0); + +	so_method(so, rankine, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if (cso->offset_cw || cso->offset_ccw) { +		so_method(so, rankine, NV34TCL_POLYGON_OFFSET_FACTOR, 2); +		so_data  (so, fui(cso->offset_scale)); +		so_data  (so, fui(cso->offset_units * 2)); +	} + +	so_method(so, rankine, NV34TCL_POINT_SPRITE, 1); +	if (cso->point_sprite) { +		unsigned psctl = (1 << 0), i; + +		for (i = 0; i < 8; i++) { +			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) +				psctl |= (1 << (8 + i)); +		} + +		so_data(so, psctl); +	} else { +		so_data(so, 0); +	} + +	so_ref(so, &rsso->so); +	rsso->pipe = *cso; +	return (void *)rsso; +} + +static void +nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->rasterizer = hwcso; +	nv30->dirty |= NV30_NEW_RAST; +	/*nv30->draw_dirty |= NV30_NEW_RAST;*/ +} + +static void +nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_rasterizer_state *rsso = hwcso; + +	so_ref(NULL, &rsso->so); +	FREE(rsso); +} + +static void * +nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); +	struct nouveau_stateobj *so = so_new(32, 0); +	struct nouveau_grobj *rankine = nv30->screen->rankine; + +	so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3); +	so_data  (so, nvgl_comparison_op(cso->depth.func)); +	so_data  (so, cso->depth.writemask ? 1 : 0); +	so_data  (so, cso->depth.enabled ? 1 : 0); + +	so_method(so, rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3); +	so_data  (so, cso->alpha.enabled ? 1 : 0); +	so_data  (so, nvgl_comparison_op(cso->alpha.func)); +	so_data  (so, float_to_ubyte(cso->alpha.ref_value)); + +	if (cso->stencil[0].enabled) { +		so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 8); +		so_data  (so, cso->stencil[0].enabled ? 1 : 0); +		so_data  (so, cso->stencil[0].writemask); +		so_data  (so, nvgl_comparison_op(cso->stencil[0].func)); +		so_data  (so, cso->stencil[0].ref_value); +		so_data  (so, cso->stencil[0].valuemask); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); +	} else { +		so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 1); +		so_data  (so, 0); +	} + +	if (cso->stencil[1].enabled) { +		so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 8); +		so_data  (so, cso->stencil[1].enabled ? 
1 : 0); +		so_data  (so, cso->stencil[1].writemask); +		so_data  (so, nvgl_comparison_op(cso->stencil[1].func)); +		so_data  (so, cso->stencil[1].ref_value); +		so_data  (so, cso->stencil[1].valuemask); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); +	} else { +		so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_ref(so, &zsaso->so); +	zsaso->pipe = *cso; +	return (void *)zsaso; +} + +static void +nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->zsa = hwcso; +	nv30->dirty |= NV30_NEW_ZSA; +} + +static void +nv30_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_zsa_state *zsaso = hwcso; + +	so_ref(NULL, &zsaso->so); +	FREE(zsaso); +} + +static void * +nv30_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	/*struct nv30_context *nv30 = nv30_context(pipe);*/ +	struct nv30_vertex_program *vp; + +	vp = CALLOC(1, sizeof(struct nv30_vertex_program)); +	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	/*vp->draw = draw_create_vertex_shader(nv30->draw, &vp->pipe);*/ + +	return (void *)vp; +} + +static void +nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->vertprog = hwcso; +	nv30->dirty |= NV30_NEW_VERTPROG; +	/*nv30->draw_dirty |= NV30_NEW_VERTPROG;*/ +} + +static void +nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_vertex_program *vp = hwcso; + +	/*draw_delete_vertex_shader(nv30->draw, vp->draw);*/ +	nv30_vertprog_destroy(nv30, vp); +	FREE((void*)vp->pipe.tokens); +	FREE(vp); +} + +static void * +nv30_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv30_fragment_program *fp; + +	fp = CALLOC(1, sizeof(struct nv30_fragment_program)); +	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + +	tgsi_scan_shader(fp->pipe.tokens, &fp->info); + +	return (void *)fp; +} + +static void +nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->fragprog = hwcso; +	nv30->dirty |= NV30_NEW_FRAGPROG; +} + +static void +nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv30_fragment_program *fp = hwcso; + +	nv30_fragprog_destroy(nv30, fp); +	FREE((void*)fp->pipe.tokens); +	FREE(fp); +} + +static void +nv30_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->blend_colour = *bcol; +	nv30->dirty |= NV30_NEW_BCOL; +} + +static void +nv30_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +} + +static void +nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->constbuf[shader] = buf->buffer; +	nv30->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float)); + +	if (shader == PIPE_SHADER_VERTEX) { +		nv30->dirty |= NV30_NEW_VERTPROG; +	} else +	if (shader == PIPE_SHADER_FRAGMENT) { +		nv30->dirty |= NV30_NEW_FRAGPROG; +	} +} + +static void +nv30_set_framebuffer_state(struct 
pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->framebuffer = *fb; +	nv30->dirty |= NV30_NEW_FB; +} + +static void +nv30_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	memcpy(nv30->stipple, stipple->stipple, 4 * 32); +	nv30->dirty |= NV30_NEW_STIPPLE; +} + +static void +nv30_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->scissor = *s; +	nv30->dirty |= NV30_NEW_SCISSOR; +} + +static void +nv30_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *vpt) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->viewport = *vpt; +	nv30->dirty |= NV30_NEW_VIEWPORT; +	/*nv30->draw_dirty |= NV30_NEW_VIEWPORT;*/ +} + +static void +nv30_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_buffer *vb) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	memcpy(nv30->vtxbuf, vb, sizeof(*vb) * count); +	nv30->vtxbuf_nr = count; + +	nv30->dirty |= NV30_NEW_ARRAYS; +	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			 const struct pipe_vertex_element *ve) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	memcpy(nv30->vtxelt, ve, sizeof(*ve) * count); +	nv30->vtxelt_nr = count; + +	nv30->dirty |= NV30_NEW_ARRAYS; +	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +	struct nv30_context *nv30 = nv30_context(pipe); + +	nv30->edgeflags = bitfield; +	nv30->dirty |= NV30_NEW_ARRAYS; +	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +void +nv30_init_state_functions(struct nv30_context *nv30) +{ +	nv30->pipe.create_blend_state = nv30_blend_state_create; +	nv30->pipe.bind_blend_state = nv30_blend_state_bind; +	nv30->pipe.delete_blend_state = nv30_blend_state_delete; + +	nv30->pipe.create_sampler_state = nv30_sampler_state_create; +	nv30->pipe.bind_sampler_states = nv30_sampler_state_bind; +	nv30->pipe.delete_sampler_state = nv30_sampler_state_delete; +	nv30->pipe.set_sampler_textures = nv30_set_sampler_texture; + +	nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create; +	nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind; +	nv30->pipe.delete_rasterizer_state = nv30_rasterizer_state_delete; + +	nv30->pipe.create_depth_stencil_alpha_state = +		nv30_depth_stencil_alpha_state_create; +	nv30->pipe.bind_depth_stencil_alpha_state = +		nv30_depth_stencil_alpha_state_bind; +	nv30->pipe.delete_depth_stencil_alpha_state = +		nv30_depth_stencil_alpha_state_delete; + +	nv30->pipe.create_vs_state = nv30_vp_state_create; +	nv30->pipe.bind_vs_state = nv30_vp_state_bind; +	nv30->pipe.delete_vs_state = nv30_vp_state_delete; + +	nv30->pipe.create_fs_state = nv30_fp_state_create; +	nv30->pipe.bind_fs_state = nv30_fp_state_bind; +	nv30->pipe.delete_fs_state = nv30_fp_state_delete; + +	nv30->pipe.set_blend_color = nv30_set_blend_color; +	nv30->pipe.set_clip_state = nv30_set_clip_state; +	nv30->pipe.set_constant_buffer = nv30_set_constant_buffer; +	nv30->pipe.set_framebuffer_state = nv30_set_framebuffer_state; +	nv30->pipe.set_polygon_stipple = nv30_set_polygon_stipple; +	nv30->pipe.set_scissor_state = nv30_set_scissor_state; +	nv30->pipe.set_viewport_state = 
nv30_set_viewport_state; + +	nv30->pipe.set_edgeflags = nv30_set_edgeflags; +	nv30->pipe.set_vertex_buffers = nv30_set_vertex_buffers; +	nv30->pipe.set_vertex_elements = nv30_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv30/nv30_state.h b/src/gallium/drivers/nv30/nv30_state.h new file mode 100644 index 0000000000..2023278e37 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.h @@ -0,0 +1,88 @@ +#ifndef __NV30_STATE_H__ +#define __NV30_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv30_sampler_state { +	uint32_t fmt; +	uint32_t wrap; +	uint32_t en; +	uint32_t filt; +	uint32_t bcol; +}; + +struct nv30_vertex_program_exec { +	uint32_t data[4]; +	boolean has_branch_offset; +	int const_index; +}; + +struct nv30_vertex_program_data { +	int index; /* immediates == -1 */ +	float value[4]; +}; + +struct nv30_vertex_program { +	struct pipe_shader_state pipe; + +	boolean translated; + +	struct nv30_vertex_program_exec *insns; +	unsigned nr_insns; +	struct nv30_vertex_program_data *consts; +	unsigned nr_consts; + +	struct nouveau_resource *exec; +	unsigned exec_start; +	struct nouveau_resource *data; +	unsigned data_start; +	unsigned data_start_min; + +	uint32_t ir; +	uint32_t or; +	struct nouveau_stateobj *so; +}; + +struct nv30_fragment_program_data { +	unsigned offset; +	unsigned index; +}; + +struct nv30_fragment_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; + +	boolean translated; +	boolean on_hw; +	unsigned samplers; + +	uint32_t *insn; +	int       insn_len; + +	struct nv30_fragment_program_data *consts; +	unsigned nr_consts; + +	struct pipe_buffer *buffer; + +	uint32_t fp_control; +	uint32_t fp_reg_control; +	struct nouveau_stateobj *so; +}; + +struct nv30_miptree { +	struct pipe_texture base; + +	struct pipe_buffer *buffer; +	uint total_size; + +	struct pipe_texture *shadow_tex; +	struct pipe_surface *shadow_surface; + +	struct { +		uint pitch; +		uint *image_offset; +	} level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c new file mode 100644 index 0000000000..44d43e132a --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_blend.c @@ -0,0 +1,40 @@ +#include "nv30_context.h" + +static boolean +nv30_state_blend_validate(struct nv30_context *nv30) +{ +	so_ref(nv30->blend->so, &nv30->state.hw[NV30_STATE_BLEND]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_blend = { +	.validate = nv30_state_blend_validate, +	.dirty = { +		.pipe = NV30_NEW_BLEND, +		.hw = NV30_STATE_BLEND +	} +}; + +static boolean +nv30_state_blend_colour_validate(struct nv30_context *nv30) +{ +	struct nouveau_stateobj *so = so_new(2, 0); +	struct pipe_blend_color *bcol = &nv30->blend_colour; + +	so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1); +	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) | +		       (float_to_ubyte(bcol->color[0]) << 16) | +		       (float_to_ubyte(bcol->color[1]) <<  8) | +		       (float_to_ubyte(bcol->color[2]) <<  0))); + +	so_ref(so, &nv30->state.hw[NV30_STATE_BCOL]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_blend_colour = { +	.validate = nv30_state_blend_colour_validate, +	.dirty = { +		.pipe = NV30_NEW_BCOL, +		.hw = NV30_STATE_BCOL +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c new file mode 100644 index 0000000000..f77b08ff69 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_emit.c @@ -0,0 +1,118 @@ +#include 
"nv30_context.h" +#include "nv30_state.h" + +static struct nv30_state_entry *render_states[] = { +	&nv30_state_framebuffer, +	&nv30_state_rasterizer, +	&nv30_state_scissor, +	&nv30_state_stipple, +	&nv30_state_fragprog, +	&nv30_state_fragtex, +	&nv30_state_vertprog, +	&nv30_state_blend, +	&nv30_state_blend_colour, +	&nv30_state_zsa, +	&nv30_state_viewport, +	&nv30_state_vbo, +	NULL +}; + +static void +nv30_state_do_validate(struct nv30_context *nv30, +		       struct nv30_state_entry **states) +{ +	const struct pipe_framebuffer_state *fb = &nv30->framebuffer; +	unsigned i; + +	for (i = 0; i < fb->nr_cbufs; i++) +		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; +	if (fb->zsbuf) +		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + +	while (*states) { +		struct nv30_state_entry *e = *states; + +		if (nv30->dirty & e->dirty.pipe) { +			if (e->validate(nv30)) { +				nv30->state.dirty |= (1ULL << e->dirty.hw); +			} +		} + +		states++; +	} +	nv30->dirty = 0; +} + +void +nv30_state_emit(struct nv30_context *nv30) +{ +	struct nv30_state *state = &nv30->state; +	struct nv30_screen *screen = nv30->screen; +	unsigned i, samplers; +	uint64_t states; + +	if (nv30->pctx_id != screen->cur_pctx) { +		for (i = 0; i < NV30_STATE_MAX; i++) { +			if (state->hw[i] && screen->state[i] != state->hw[i]) +				state->dirty |= (1ULL << i); +		} + +		screen->cur_pctx = nv30->pctx_id; +	} + +	for (i = 0, states = state->dirty; states; i++) { +		if (!(states & (1ULL << i))) +			continue; +		so_ref (state->hw[i], &nv30->screen->state[i]); +		if (state->hw[i]) +			so_emit(nv30->nvws, nv30->screen->state[i]); +		states &= ~(1ULL << i); +	} + +	state->dirty = 0; + +	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FB]); +	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { +		if (!(samplers & (1 << i))) +			continue; +		so_emit_reloc_markers(nv30->nvws, +				      state->hw[NV30_STATE_FRAGTEX0+i]); +		samplers &= ~(1ULL << i); +	} +	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FRAGPROG]); +	if (state->hw[NV30_STATE_VTXBUF] /*&& nv30->render_mode == HW*/) +		so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_VTXBUF]); +} + +boolean +nv30_state_validate(struct nv30_context *nv30) +{ +#if 0 +	boolean was_sw = nv30->fallback_swtnl ? TRUE : FALSE; + +	if (nv30->render_mode != HW) { +		/* Don't even bother trying to go back to hw if none +		 * of the states that caused swtnl previously have changed. 
+		 */ +		if ((nv30->fallback_swtnl & nv30->dirty) +				!= nv30->fallback_swtnl) +			return FALSE; + +		/* Attempt to go to hwtnl again */ +		nv30->pipe.flush(&nv30->pipe, 0, NULL); +		nv30->dirty |= (NV30_NEW_VIEWPORT | +				NV30_NEW_VERTPROG | +				NV30_NEW_ARRAYS); +		nv30->render_mode = HW; +	} +#endif +	nv30_state_do_validate(nv30, render_states); +#if 0 +	if (nv30->fallback_swtnl || nv30->fallback_swrast) +		return FALSE; +	 +	if (was_sw) +		NOUVEAU_ERR("swtnl->hw\n"); +#endif +	return TRUE; +} diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c new file mode 100644 index 0000000000..77368cb205 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_fb.c @@ -0,0 +1,144 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +static boolean +nv30_state_framebuffer_validate(struct nv30_context *nv30) +{ +	struct pipe_framebuffer_state *fb = &nv30->framebuffer; +	struct pipe_surface *rt[2], *zeta = NULL; +	uint32_t rt_enable, rt_format; +	int i, colour_format = 0, zeta_format = 0; +	struct nouveau_stateobj *so = so_new(64, 10); +	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; +	unsigned w = fb->width; +	unsigned h = fb->height; +	struct nv30_miptree *nv30mt; + +	rt_enable = 0; +	for (i = 0; i < fb->nr_cbufs; i++) { +		if (colour_format) { +			assert(colour_format == fb->cbufs[i]->format); +		} else { +			colour_format = fb->cbufs[i]->format; +			rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i); +			rt[i] = fb->cbufs[i]; +		} +	} + +	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) +		rt_enable |= NV34TCL_RT_ENABLE_MRT; + +	if (fb->zsbuf) { +		zeta_format = fb->zsbuf->format; +		zeta = fb->zsbuf; +	} + +	if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); +		for (i = 1; i < fb->nr_cbufs; i++) +			assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + +		/* FIXME: NV34TCL_RT_FORMAT_LOG2_[WIDTH/HEIGHT] */ +		rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | +		log2i(fb->width) << 16 /*NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT*/ | +		log2i(fb->height) << 24 /*NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT*/; +	} +	else +		rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; + +	switch (colour_format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case 0: +		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5; +		break; +	default: +		assert(0); +	} + +	switch (zeta_format) { +	case PIPE_FORMAT_Z16_UNORM: +		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16; +		break; +	case PIPE_FORMAT_Z24S8_UNORM: +	case 0: +		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8; +		break; +	default: +		assert(0); +	} + +	if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) { +		uint32_t pitch = rt[0]->stride; +		if (zeta) { +			pitch |= (zeta->stride << 16); +		} else { +			pitch |= (pitch << 16); +		} + +		nv30mt = (struct nv30_miptree *)rt[0]->texture; +		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR0, 1); +		so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR, +			  nv30->nvws->channel->vram->handle, +			  nv30->nvws->channel->gart->handle); +		so_method(so, nv30->screen->rankine, NV34TCL_COLOR0_PITCH, 2); +		so_data  (so, pitch); +		so_reloc (so, nv30mt->buffer, rt[0]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +	} + +	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) { +		nv30mt = (struct nv30_miptree *)rt[1]->texture; +		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR1, 1); +		so_reloc (so, nv30mt->buffer, 0, 
rt_flags | NOUVEAU_BO_OR, +			  nv30->nvws->channel->vram->handle, +			  nv30->nvws->channel->gart->handle); +		so_method(so, nv30->screen->rankine, NV34TCL_COLOR1_OFFSET, 2); +		so_reloc (so, nv30mt->buffer, rt[1]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		so_data  (so, rt[1]->stride); +	} + +	if (zeta_format) { +		nv30mt = (struct nv30_miptree *)zeta->texture; +		so_method(so, nv30->screen->rankine, NV34TCL_DMA_ZETA, 1); +		so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR, +			  nv30->nvws->channel->vram->handle, +			  nv30->nvws->channel->gart->handle); +		so_method(so, nv30->screen->rankine, NV34TCL_ZETA_OFFSET, 1); +		so_reloc (so, nv30mt->buffer, zeta->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		/* TODO: allocate LMA depth buffer */ +	} + +	so_method(so, nv30->screen->rankine, NV34TCL_RT_ENABLE, 1); +	so_data  (so, rt_enable); +	so_method(so, nv30->screen->rankine, NV34TCL_RT_HORIZ, 3); +	so_data  (so, (w << 16) | 0); +	so_data  (so, (h << 16) | 0); +	so_data  (so, rt_format); +	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_HORIZ, 2); +	so_data  (so, (w << 16) | 0); +	so_data  (so, (h << 16) | 0); +	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2); +	so_data  (so, ((w - 1) << 16) | 0); +	so_data  (so, ((h - 1) << 16) | 0); +	so_method(so, nv30->screen->rankine, 0x1d88, 1); +	so_data  (so, (1 << 12) | h); +	/* Wonder why this is needed, context should all be set to zero on init */ +	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1); +	so_data  (so, 0); + +	so_ref(so, &nv30->state.hw[NV30_STATE_FB]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_framebuffer = { +	.validate = nv30_state_framebuffer_validate, +	.dirty = { +		.pipe = NV30_NEW_FB, +		.hw = NV30_STATE_FB +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_rasterizer.c b/src/gallium/drivers/nv30/nv30_state_rasterizer.c new file mode 100644 index 0000000000..6d1b60e043 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_rasterizer_validate(struct nv30_context *nv30) +{ +	so_ref(nv30->rasterizer->so, +	       &nv30->state.hw[NV30_STATE_RAST]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_rasterizer = { +	.validate = nv30_state_rasterizer_validate, +	.dirty = { +		.pipe = NV30_NEW_RAST, +		.hw = NV30_STATE_RAST +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c new file mode 100644 index 0000000000..1db9bc1795 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv30_context.h" + +static boolean +nv30_state_scissor_validate(struct nv30_context *nv30) +{ +	struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; +	struct pipe_scissor_state *s = &nv30->scissor; +	struct nouveau_stateobj *so; + +	if (nv30->state.hw[NV30_STATE_SCISSOR] && +	    (rast->scissor == 0 && nv30->state.scissor_enabled == 0)) +		return FALSE; +	nv30->state.scissor_enabled = rast->scissor; + +	so = so_new(3, 0); +	so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2); +	if (nv30->state.scissor_enabled) { +		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx); +		so_data  (so, ((s->maxy - s->miny) << 16) | s->miny); +	} else { +		so_data  (so, 4096 << 16); +		so_data  (so, 4096 << 16); +	} + +	so_ref(so, &nv30->state.hw[NV30_STATE_SCISSOR]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_scissor = { +	.validate = nv30_state_scissor_validate, +	
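	/* Re-run the validate above when either the scissor rectangle or the
	 * rasterizer changes, since the rasterizer CSO carries the scissor
	 * enable bit that the validate consults. */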
.dirty = { +		.pipe = NV30_NEW_SCISSOR | NV30_NEW_RAST, +		.hw = NV30_STATE_SCISSOR +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c new file mode 100644 index 0000000000..41b42813b4 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_stipple.c @@ -0,0 +1,39 @@ +#include "nv30_context.h" + +static boolean +nv30_state_stipple_validate(struct nv30_context *nv30) +{ +	struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; +	struct nouveau_grobj *rankine = nv30->screen->rankine; +	struct nouveau_stateobj *so; + +	if (nv30->state.hw[NV30_STATE_STIPPLE] && +	   (rast->poly_stipple_enable == 0 && nv30->state.stipple_enabled == 0)) +		return FALSE; + +	if (rast->poly_stipple_enable) { +		unsigned i; + +		so = so_new(35, 0); +		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); +		so_data  (so, 1); +		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32); +		for (i = 0; i < 32; i++) +			so_data(so, nv30->stipple[i]); +	} else { +		so = so_new(2, 0); +		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_ref(so, &nv30->state.hw[NV30_STATE_STIPPLE]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_stipple = { +	.validate = nv30_state_stipple_validate, +	.dirty = { +		.pipe = NV30_NEW_STIPPLE | NV30_NEW_RAST, +		.hw = NV30_STATE_STIPPLE, +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c new file mode 100644 index 0000000000..951d40ebfd --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_viewport.c @@ -0,0 +1,70 @@ +#include "nv30_context.h" + +static boolean +nv30_state_viewport_validate(struct nv30_context *nv30) +{ +	struct pipe_viewport_state *vpt = &nv30->viewport; +	struct nouveau_stateobj *so; +	unsigned bypass; + +	if (/*nv30->render_mode == HW &&*/ !nv30->rasterizer->pipe.bypass_clipping) +		bypass = 0; +	else +		bypass = 1; + +	if (nv30->state.hw[NV30_STATE_VIEWPORT] && +	    (bypass || !(nv30->dirty & NV30_NEW_VIEWPORT)) && +	    nv30->state.viewport_bypass == bypass) +		return FALSE; +	nv30->state.viewport_bypass = bypass; + +	so = so_new(11, 0); +	if (!bypass) { +		so_method(so, nv30->screen->rankine, +			  NV34TCL_VIEWPORT_TRANSLATE_X, 8); +		so_data  (so, fui(vpt->translate[0])); +		so_data  (so, fui(vpt->translate[1])); +		so_data  (so, fui(vpt->translate[2])); +		so_data  (so, fui(vpt->translate[3])); +		so_data  (so, fui(vpt->scale[0])); +		so_data  (so, fui(vpt->scale[1])); +		so_data  (so, fui(vpt->scale[2])); +		so_data  (so, fui(vpt->scale[3])); +/*		so_method(so, nv30->screen->rankine, 0x1d78, 1); +		so_data  (so, 1); +*/	} else { +		so_method(so, nv30->screen->rankine, +			  NV34TCL_VIEWPORT_TRANSLATE_X, 8); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(0.0)); +		/* Not entirely certain what this is yet.  The DDX uses this +		 * value also as it fixes rendering when you pass +		 * pre-transformed vertices to the GPU.  My best gusss is that +		 * this bypasses some culling/clipping stage.  Might be worth +		 * noting that points/lines are uneffected by whatever this +		 * value fixes, only filled polygons are effected. 
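		 * Note that this path currently leaves the commented 0x0110
		 * write disabled and relies on the unconditional 0x1d78 = 1
		 * write just below, per the TODO about the renouveau dumps.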
+		 */ +/*		so_method(so, nv30->screen->rankine, 0x1d78, 1); +		so_data  (so, 0x110); +*/	} +	/* TODO/FIXME: never saw value 0x0110 in renouveau dumps, only 0x0001 */ +	so_method(so, nv30->screen->rankine, 0x1d78, 1); +	so_data  (so, 1); + +	so_ref(so, &nv30->state.hw[NV30_STATE_VIEWPORT]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_viewport = { +	.validate = nv30_state_viewport_validate, +	.dirty = { +		.pipe = NV30_NEW_VIEWPORT | NV30_NEW_RAST, +		.hw = NV30_STATE_VIEWPORT +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_state_zsa.c b/src/gallium/drivers/nv30/nv30_state_zsa.c new file mode 100644 index 0000000000..0940b7269b --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_zsa_validate(struct nv30_context *nv30) +{ +	so_ref(nv30->zsa->so, +	       &nv30->state.hw[NV30_STATE_ZSA]); +	return TRUE; +} + +struct nv30_state_entry nv30_state_zsa = { +	.validate = nv30_state_zsa_validate, +	.dirty = { +		.pipe = NV30_NEW_ZSA, +		.hw = NV30_STATE_ZSA +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_surface.c b/src/gallium/drivers/nv30/nv30_surface.c new file mode 100644 index 0000000000..0f8dc12045 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#include "nv30_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv30_surface_copy(struct pipe_context *pipe, boolean do_flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv04_surface_2d *eng2d = nv30->screen->eng2d; + +	if (do_flip) { +		desty += height; +		while (height--) { +			eng2d->copy(eng2d, dest, destx, desty--, src, +				    srcx, srcy++, width, 1); +		} +		return; +	} + +	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv30_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nv04_surface_2d *eng2d = nv30->screen->eng2d; + +	eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv30_init_surface_functions(struct nv30_context *nv30) +{ +	nv30->pipe.surface_copy = nv30_surface_copy; +	nv30->pipe.surface_fill = nv30_surface_fill; +} diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c new file mode 100644 index 0000000000..2d6d48ac16 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vbo.c @@ -0,0 +1,556 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv30_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ +	switch (pipe) { +	case PIPE_FORMAT_R32_FLOAT: +	case PIPE_FORMAT_R32G32_FLOAT: +	case PIPE_FORMAT_R32G32B32_FLOAT: +	case PIPE_FORMAT_R32G32B32A32_FLOAT: +		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT; +		break; +	case PIPE_FORMAT_R8_UNORM: +	case PIPE_FORMAT_R8G8_UNORM: +	case PIPE_FORMAT_R8G8B8_UNORM: +	case PIPE_FORMAT_R8G8B8A8_UNORM: +		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE; +		break; +	case PIPE_FORMAT_R16_SSCALED: +	case PIPE_FORMAT_R16G16_SSCALED: +	case PIPE_FORMAT_R16G16B16_SSCALED: +	case PIPE_FORMAT_R16G16B16A16_SSCALED: +		*fmt = NV34TCL_VTXFMT_TYPE_USHORT; +		break; +	default: +		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); +		return 1; +	} + +	switch (pipe) { +	case PIPE_FORMAT_R8_UNORM: +	case PIPE_FORMAT_R32_FLOAT: +	case PIPE_FORMAT_R16_SSCALED: +		*ncomp = 1; +		break; +	case PIPE_FORMAT_R8G8_UNORM: +	case PIPE_FORMAT_R32G32_FLOAT: +	case PIPE_FORMAT_R16G16_SSCALED: +		*ncomp = 2; +		break; +	case PIPE_FORMAT_R8G8B8_UNORM: +	case PIPE_FORMAT_R32G32B32_FLOAT: +	case PIPE_FORMAT_R16G16B16_SSCALED: +		*ncomp = 3; +		break; +	case PIPE_FORMAT_R8G8B8A8_UNORM: +	case PIPE_FORMAT_R32G32B32A32_FLOAT: +	case PIPE_FORMAT_R16G16B16A16_SSCALED: +		*ncomp = 4; +		break; +	default: +		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); +		return 1; +	} + +	return 0; +} + +static boolean +nv30_vbo_set_idxbuf(struct nv30_context *nv30, struct pipe_buffer *ib, +		    unsigned ib_size) +{ +	struct pipe_screen *pscreen = &nv30->screen->pipe; +	unsigned type; + +	if (!ib) { +		nv30->idxbuf = NULL; +		nv30->idxbuf_format = 0xdeadbeef; +		return FALSE; +	} + +	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size 
== 1) +		return FALSE; + +	switch (ib_size) { +	case 2: +		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; +		break; +	case 4: +		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; +		break; +	default: +		return FALSE; +	} + +	if (ib != nv30->idxbuf || +	    type != nv30->idxbuf_format) { +		nv30->dirty |= NV30_NEW_ARRAYS; +		nv30->idxbuf = ib; +		nv30->idxbuf_format = type; +	} + +	return TRUE; +} + +static boolean +nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so, +		       int attrib, struct pipe_vertex_element *ve, +		       struct pipe_vertex_buffer *vb) +{ +	struct pipe_winsys *ws = nv30->pipe.winsys; +	struct nouveau_grobj *rankine = nv30->screen->rankine; +	unsigned type, ncomp; +	void *map; + +	if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) +		return FALSE; + +	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); +	map += vb->buffer_offset + ve->src_offset; + +	switch (type) { +	case NV34TCL_VTXFMT_TYPE_FLOAT: +	{ +		float *v = map; + +		switch (ncomp) { +		case 4: +			so_method(so, rankine, NV34TCL_VTX_ATTR_4F_X(attrib), 4); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			so_data  (so, fui(v[2])); +			so_data  (so, fui(v[3])); +			break; +		case 3: +			so_method(so, rankine, NV34TCL_VTX_ATTR_3F_X(attrib), 3); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			so_data  (so, fui(v[2])); +			break; +		case 2: +			so_method(so, rankine, NV34TCL_VTX_ATTR_2F_X(attrib), 2); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			break; +		case 1: +			so_method(so, rankine, NV34TCL_VTX_ATTR_1F(attrib), 1); +			so_data  (so, fui(v[0])); +			break; +		default: +			ws->buffer_unmap(ws, vb->buffer); +			return FALSE; +		} +	} +		break; +	default: +		ws->buffer_unmap(ws, vb->buffer); +		return FALSE; +	} + +	ws->buffer_unmap(ws, vb->buffer); + +	return TRUE; +} + +boolean +nv30_draw_arrays(struct pipe_context *pipe, +		 unsigned mode, unsigned start, unsigned count) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nouveau_channel *chan = nv30->nvws->channel; +	unsigned restart = 0; + +	nv30_vbo_set_idxbuf(nv30, NULL, 0); +	if (FORCE_SWTNL || !nv30_state_validate(nv30)) { +		/*return nv30_draw_elements_swtnl(pipe, NULL, 0, +						mode, start, count);*/ +		return FALSE; +	} + +	while (count) { +		unsigned vc, nr; + +		nv30_state_emit(nv30); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, +					mode, start, count, &restart); +		if (!vc) { +			FIRE_RING(NULL); +			continue; +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		nr = (vc & 0xff); +		if (nr) { +			BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1); +			OUT_RING  (((nr - 1) << 24) | start); +			start += nr; +		} + +		nr = vc >> 8; +		while (nr) { +			unsigned push = nr > 2047 ? 
2047 : nr; + +			nr -= push; + +			BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push); +			while (push--) { +				OUT_RING(((0x100 - 1) << 24) | start); +				start += 0x100; +			} +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (0); + +		count -= vc; +		start = restart; +	} + +	pipe->flush(pipe, 0, NULL); +	return TRUE; +} + +static INLINE void +nv30_draw_elements_u08(struct nv30_context *nv30, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv30->nvws->channel; + +	while (count) { +		uint8_t *elts = (uint8_t *)ib + start; +		unsigned vc, push, restart = 0; + +		nv30_state_emit(nv30); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		if (vc & 1) { +			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); +			OUT_RING  (elts[0]); +			elts++; vc--; +		} + +		while (vc) { +			unsigned i; + +			push = MIN2(vc, 2047 * 2); + +			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); +			for (i = 0; i < push; i+=2) +				OUT_RING((elts[i+1] << 16) | elts[i]); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (0); + +		start = restart; +	} +} + +static INLINE void +nv30_draw_elements_u16(struct nv30_context *nv30, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv30->nvws->channel; + +	while (count) { +		uint16_t *elts = (uint16_t *)ib + start; +		unsigned vc, push, restart = 0; + +		nv30_state_emit(nv30); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		if (vc & 1) { +			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); +			OUT_RING  (elts[0]); +			elts++; vc--; +		} + +		while (vc) { +			unsigned i; + +			push = MIN2(vc, 2047 * 2); + +			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); +			for (i = 0; i < push; i+=2) +				OUT_RING((elts[i+1] << 16) | elts[i]); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (0); + +		start = restart; +	} +} + +static INLINE void +nv30_draw_elements_u32(struct nv30_context *nv30, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv30->nvws->channel; + +	while (count) { +		uint32_t *elts = (uint32_t *)ib + start; +		unsigned vc, push, restart = 0; + +		nv30_state_emit(nv30); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		while (vc) { +			push = MIN2(vc, 2047); + +			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push); +			OUT_RINGp    (elts, push); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (0); + +		start = restart; +	} +} + +static boolean +nv30_draw_elements_inline(struct pipe_context *pipe, +			  struct pipe_buffer *ib, unsigned ib_size, +			  unsigned mode, unsigned start, unsigned count) +{ +	struct nv30_context *nv30 = 
nv30_context(pipe); +	struct pipe_winsys *ws = pipe->winsys; +	void *map; + +	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); +	if (!ib) { +		NOUVEAU_ERR("failed mapping ib\n"); +		return FALSE; +	} + +	switch (ib_size) { +	case 1: +		nv30_draw_elements_u08(nv30, map, mode, start, count); +		break; +	case 2: +		nv30_draw_elements_u16(nv30, map, mode, start, count); +		break; +	case 4: +		nv30_draw_elements_u32(nv30, map, mode, start, count); +		break; +	default: +		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); +		break; +	} + +	ws->buffer_unmap(ws, ib); +	return TRUE; +} + +static boolean +nv30_draw_elements_vbo(struct pipe_context *pipe, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	struct nouveau_channel *chan = nv30->nvws->channel; +	unsigned restart = 0; + +	while (count) { +		unsigned nr, vc; + +		nv30_state_emit(nv30); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, +					mode, start, count, &restart); +		if (!vc) { +			FIRE_RING(NULL); +			continue; +		} +		 +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		nr = (vc & 0xff); +		if (nr) { +			BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1); +			OUT_RING  (((nr - 1) << 24) | start); +			start += nr; +		} + +		nr = vc >> 8; +		while (nr) { +			unsigned push = nr > 2047 ? 2047 : nr; + +			nr -= push; + +			BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push); +			while (push--) { +				OUT_RING(((0x100 - 1) << 24) | start); +				start += 0x100; +			} +		} + +		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); +		OUT_RING  (0); + +		count -= vc; +		start = restart; +	} + +	return TRUE; +} + +boolean +nv30_draw_elements(struct pipe_context *pipe, +		   struct pipe_buffer *indexBuffer, unsigned indexSize, +		   unsigned mode, unsigned start, unsigned count) +{ +	struct nv30_context *nv30 = nv30_context(pipe); +	boolean idxbuf; + +	idxbuf = nv30_vbo_set_idxbuf(nv30, indexBuffer, indexSize); +	if (FORCE_SWTNL || !nv30_state_validate(nv30)) { +		/*return nv30_draw_elements_swtnl(pipe, NULL, 0, +						mode, start, count);*/ +		return FALSE;	 +	} + +	if (idxbuf) { +		nv30_draw_elements_vbo(pipe, mode, start, count); +	} else { +		nv30_draw_elements_inline(pipe, indexBuffer, indexSize, +					  mode, start, count); +	} + +	pipe->flush(pipe, 0, NULL); +	return TRUE; +} + +static boolean +nv30_vbo_validate(struct nv30_context *nv30) +{ +	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL; +	struct nouveau_grobj *rankine = nv30->screen->rankine; +	struct pipe_buffer *ib = nv30->idxbuf; +	unsigned ib_format = nv30->idxbuf_format; +	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; +	int hw; + +	if (nv30->edgeflags) { +		/*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ +		return FALSE; +	} + +	vtxbuf = so_new(20, 18); +	so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr); +	vtxfmt = so_new(17, 0); +	so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr); + +	for (hw = 0; hw < nv30->vtxelt_nr; hw++) { +		struct pipe_vertex_element *ve; +		struct pipe_vertex_buffer *vb; +		unsigned type, ncomp; + +		ve = &nv30->vtxelt[hw]; +		vb = &nv30->vtxbuf[ve->vertex_buffer_index]; + +		if (!vb->stride) { +			if (!sattr) +				sattr = so_new(16 * 5, 0); + +			if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) { +				so_data(vtxbuf, 0); +				so_data(vtxfmt, NV34TCL_VTXFMT_TYPE_FLOAT); +				continue; +			} +		} + +		if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { +			
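+			/* A non-zero return here appears to mean the pipe vertex
+			 * format has no hardware fetch encoding; the partially
+			 * built state objects are dropped and validation fails
+			 * (the swtnl fallback below is still commented out).
+			 */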
/*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ +			so_ref(NULL, &vtxbuf); +			so_ref(NULL, &vtxfmt); +			return FALSE; +		} + +		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, +			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, +			 0, NV34TCL_VTXBUF_ADDRESS_DMA1); +		so_data (vtxfmt, ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) | +				  (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type)); +	} + +	if (ib) { +		so_method(vtxbuf, rankine, NV34TCL_IDXBUF_ADDRESS, 2); +		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); +		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, +			  0, NV34TCL_IDXBUF_FORMAT_DMA1); +	} + +	so_method(vtxbuf, rankine, 0x1710, 1); +	so_data  (vtxbuf, 0); + +	so_ref(vtxbuf, &nv30->state.hw[NV30_STATE_VTXBUF]); +	nv30->state.dirty |= (1ULL << NV30_STATE_VTXBUF); +	so_ref(vtxfmt, &nv30->state.hw[NV30_STATE_VTXFMT]); +	nv30->state.dirty |= (1ULL << NV30_STATE_VTXFMT); +	so_ref(sattr, &nv30->state.hw[NV30_STATE_VTXATTR]); +	nv30->state.dirty |= (1ULL << NV30_STATE_VTXATTR); +	return FALSE; +} + +struct nv30_state_entry nv30_state_vbo = { +	.validate = nv30_vbo_validate, +	.dirty = { +		.pipe = NV30_NEW_ARRAYS, +		.hw = 0, +	} +}; diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c new file mode 100644 index 0000000000..d262725057 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +/* TODO (at least...): + *  1. Indexed consts  + ARL + *  2. Arb. swz/negation + *  3. NV_vp11, NV_vp2, NV_vp3 features + *       - extra arith opcodes + *       - branching + *       - texture sampling + *       - indexed attribs + *       - indexed results + *  4. 
bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) + +struct nv30_vpc { +	struct nv30_vertex_program *vp; + +	struct nv30_vertex_program_exec *vpi; + +	unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + +	int high_temp; +	int temp_temp_count; + +	struct nv30_sreg *imm; +	unsigned nr_imm; +}; + +static struct nv30_sreg +temp(struct nv30_vpc *vpc) +{ +	int idx; + +	idx  = vpc->temp_temp_count++; +	idx += vpc->high_temp + 1; +	return nv30_sr(NV30SR_TEMP, idx); +} + +static struct nv30_sreg +constant(struct nv30_vpc *vpc, int pipe, float x, float y, float z, float w) +{ +	struct nv30_vertex_program *vp = vpc->vp; +	struct nv30_vertex_program_data *vpd; +	int idx; + +	if (pipe >= 0) { +		for (idx = 0; idx < vp->nr_consts; idx++) { +			if (vp->consts[idx].index == pipe) +				return nv30_sr(NV30SR_CONST, idx); +		} +	} + +	idx = vp->nr_consts++; +	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); +	vpd = &vp->consts[idx]; + +	vpd->index = pipe; +	vpd->value[0] = x; +	vpd->value[1] = y; +	vpd->value[2] = z; +	vpd->value[3] = w; +	return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ +	nv30_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv30_vpc *vpc, uint32_t *hw, int pos, struct nv30_sreg src) +{ +	struct nv30_vertex_program *vp = vpc->vp; +	uint32_t sr = 0; + +	switch (src.type) { +	case NV30SR_TEMP: +		sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); +		sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); +		break; +	case NV30SR_INPUT: +		sr |= (NV30_VP_SRC_REG_TYPE_INPUT << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		vp->ir |= (1 << src.index); +		hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); +		break; +	case NV30SR_CONST: +		sr |= (NV30_VP_SRC_REG_TYPE_CONST << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		assert(vpc->vpi->const_index == -1 || +		       vpc->vpi->const_index == src.index); +		vpc->vpi->const_index = src.index; +		break; +	case NV30SR_NONE: +		sr |= (NV30_VP_SRC_REG_TYPE_INPUT << +		       NV30_VP_SRC_REG_TYPE_SHIFT); +		break; +	default: +		assert(0); +	} + +	if (src.negate) +		sr |= NV30_VP_SRC_NEGATE; + +	if (src.abs) +		hw[0] |= (1 << (21 + pos)); + +	sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | +	       (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | +	       (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | +	       (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + +/* + * |VVV| + * d�.�b + *  \u/ + * + */ + +	switch (pos) { +	case 0: +		hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> +			  NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; +		hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << +			  NV30_VP_INST_SRC0L_SHIFT; +		break; +	case 1: +		hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; +		break; +	case 2: +		hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> +			  NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; +		hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << +			  NV30_VP_INST_SRC2L_SHIFT; +		break; +	default: +		assert(0); +	} +} + +static void +emit_dst(struct nv30_vpc *vpc, uint32_t *hw, int slot, struct nv30_sreg dst) +{ +	struct nv30_vertex_program *vp = vpc->vp; + +	switch (dst.type) { +	case NV30SR_TEMP: +		hw[0] |= (dst.index << 
NV30_VP_INST_DEST_TEMP_ID_SHIFT); +		break; +	case NV30SR_OUTPUT: +		switch (dst.index) { +		case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; +		case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; +		case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; +		case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; +		case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; +		case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break; +		case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; +		case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; +		case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; +		case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; +		case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; +		case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; +		case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; +		case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; +		default: +			break; +		} + +		hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); +		hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + +		/*XXX: no way this is entirely correct, someone needs to +		 *     figure out what exactly it is. +		 */ +		hw[3] |= 0x800; +		break; +	default: +		assert(0); +	} +} + +static void +nv30_vp_arith(struct nv30_vpc *vpc, int slot, int op, +	      struct nv30_sreg dst, int mask, +	      struct nv30_sreg s0, struct nv30_sreg s1, +	      struct nv30_sreg s2) +{ +	struct nv30_vertex_program *vp = vpc->vp; +	uint32_t *hw; + +	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); +	vpc->vpi = &vp->insns[vp->nr_insns - 1]; +	memset(vpc->vpi, 0, sizeof(*vpc->vpi)); +	vpc->vpi->const_index = -1; + +	hw = vpc->vpi->data; + +	hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); +	hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | +		  (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | +		  (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | +		  (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + +	hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +//	hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +//	hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + +	if (dst.type == NV30SR_OUTPUT) { +		if (slot) +			hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); +		else +			hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); +	} else { +		if (slot) +			hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); +		else +			hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); +	} + +	emit_dst(vpc, hw, slot, dst); +	emit_src(vpc, hw, 0, s0); +	emit_src(vpc, hw, 1, s1); +	emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) { +	struct nv30_sreg src; + +	switch (fsrc->SrcRegister.File) { +	case TGSI_FILE_INPUT: +		src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); +		break; +	case TGSI_FILE_CONSTANT: +		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); +		break; +	case TGSI_FILE_IMMEDIATE: +		src = vpc->imm[fsrc->SrcRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		if (vpc->high_temp < fsrc->SrcRegister.Index) +			vpc->high_temp = fsrc->SrcRegister.Index; +		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); +		break; +	default: +		NOUVEAU_ERR("bad src file\n"); +		break; +	} + +	src.abs = fsrc->SrcRegisterExtMod.Absolute; +	src.negate = fsrc->SrcRegister.Negate; +	src.swz[0] = fsrc->SrcRegister.SwizzleX; +	src.swz[1] = fsrc->SrcRegister.SwizzleY; +	src.swz[2] = fsrc->SrcRegister.SwizzleZ; +	src.swz[3] = fsrc->SrcRegister.SwizzleW; +	return src; +} + +static INLINE 
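+/* Presumably the destination-side counterpart of tgsi_src(): OUTPUT
+ * registers are remapped through vpc->output_map, which the output
+ * declaration parser fills in, while TEMPORARY writes bump high_temp so
+ * that temp() can hand out scratch registers above the program's own
+ * temporaries.
+ */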
struct nv30_sreg +tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) { +	struct nv30_sreg dst; + +	switch (fdst->DstRegister.File) { +	case TGSI_FILE_OUTPUT: +		dst = nv30_sr(NV30SR_OUTPUT, +			      vpc->output_map[fdst->DstRegister.Index]); + +		break; +	case TGSI_FILE_TEMPORARY: +		dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index); +		if (vpc->high_temp < dst.index) +			vpc->high_temp = dst.index; +		break; +	default: +		NOUVEAU_ERR("bad dst file\n"); +		break; +	} + +	return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ +	int mask = 0; + +	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; +	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; +	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; +	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; +	return mask; +} + +static boolean +nv30_vertprog_parse_instruction(struct nv30_vpc *vpc, +				const struct tgsi_full_instruction *finst) +{ +	struct nv30_sreg src[3], dst, tmp; +	struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); +	int mask; +	int ai = -1, ci = -1; +	int i; + +	if (finst->Instruction.Opcode == TGSI_OPCODE_END) +		return TRUE; + +	vpc->temp_temp_count = 0; +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { +			src[i] = tgsi_src(vpc, fsrc); +		} +	} + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			if (ai == -1 || ai == fsrc->SrcRegister.Index) { +				ai = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		/*XXX: index comparison is broken now that consts come from +		 *     two different register files. 
+		 */ +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_IMMEDIATE: +			if (ci == -1 || ci == fsrc->SrcRegister.Index) { +				ci = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_TEMPORARY: +			/* handled above */ +			break; +		default: +			NOUVEAU_ERR("bad src file\n"); +			return FALSE; +		} +	} + +	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]); +	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + +	switch (finst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); +		break; +	case TGSI_OPCODE_ADD: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); +		break; +	case TGSI_OPCODE_ARL: +		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_DP3: +		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DP4: +		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DPH: +		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DST: +		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_EX2: +		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_EXP: +		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_FLR: +		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FRC: +		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_LG2: +		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LIT: +		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LOG: +		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_MAD: +		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); +		break; +	case TGSI_OPCODE_MAX: +		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MIN: +		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MOV: +		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_MUL: +		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_POW: +		tmp = temp(vpc); +		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, +		      swz(src[0], X, X, X, X)); +		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), +		      swz(src[1], X, X, X, X), none); +		arith(vpc, 1, OP_EX2, dst, mask, none, none, +		      swz(tmp, X, X, X, X)); +		break; +	case TGSI_OPCODE_RCP: +		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_RET: +		break; +	case TGSI_OPCODE_RSQ: +		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_SGE: +		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SGT: +		arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SLT: +		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SUB: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); +		break; +	case TGSI_OPCODE_XPD: +		tmp = temp(vpc); +		arith(vpc, 0, OP_MUL, tmp, mask, +		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); +		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), +		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, 
Y), +		      neg(tmp)); +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); +		return FALSE; +	} + +	return TRUE; +} + +static boolean +nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc, +				const struct tgsi_full_declaration *fdec) +{ +	int hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = NV30_VP_INST_DEST_POS; +		break; +	case TGSI_SEMANTIC_COLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV30_VP_INST_DEST_COL0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV30_VP_INST_DEST_COL1; +		} else { +			NOUVEAU_ERR("bad colour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_BCOLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV30_VP_INST_DEST_BFC0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV30_VP_INST_DEST_BFC1; +		} else { +			NOUVEAU_ERR("bad bcolour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_FOG: +		hw = NV30_VP_INST_DEST_FOGC; +		break; +	case TGSI_SEMANTIC_PSIZE: +		hw = NV30_VP_INST_DEST_PSZ; +		break; +	case TGSI_SEMANTIC_GENERIC: +		if (fdec->Semantic.SemanticIndex <= 7) { +			hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); +		} else { +			NOUVEAU_ERR("bad generic semantic index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad output semantic\n"); +		return FALSE; +	} + +	vpc->output_map[fdec->DeclarationRange.First] = hw; +	return TRUE; +} + +static boolean +nv30_vertprog_prepare(struct nv30_vpc *vpc) +{ +	struct tgsi_parse_context p; +	int nr_imm = 0; + +	tgsi_parse_init(&p, vpc->vp->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch(tok->Token.Type) { +		case TGSI_TOKEN_TYPE_IMMEDIATE: +			nr_imm++; +			break; +		default: +			break; +		} +	} +	tgsi_parse_free(&p); + +	if (nr_imm) { +		vpc->imm = CALLOC(nr_imm, sizeof(struct nv30_sreg)); +		assert(vpc->imm); +	} + +	return TRUE; +} + +static void +nv30_vertprog_translate(struct nv30_context *nv30, +			struct nv30_vertex_program *vp) +{ +	struct tgsi_parse_context parse; +	struct nv30_vpc *vpc = NULL; + +	tgsi_dump(vp->pipe.tokens,0); + +	vpc = CALLOC(1, sizeof(struct nv30_vpc)); +	if (!vpc) +		return; +	vpc->vp = vp; +	vpc->high_temp = -1; + +	if (!nv30_vertprog_prepare(vpc)) { +		FREE(vpc); +		return; +	} + +	tgsi_parse_init(&parse, vp->pipe.tokens); + +	while (!tgsi_parse_end_of_tokens(&parse)) { +		tgsi_parse_token(&parse); + +		switch (parse.FullToken.Token.Type) { +		case TGSI_TOKEN_TYPE_DECLARATION: +		{ +			const struct tgsi_full_declaration *fdec; +			fdec = &parse.FullToken.FullDeclaration; +			switch (fdec->Declaration.File) { +			case TGSI_FILE_OUTPUT: +				if (!nv30_vertprog_parse_decl_output(vpc, fdec)) +					goto out_err; +				break; +			default: +				break; +			} +		} +			break; +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			const struct tgsi_full_immediate *imm; + +			imm = &parse.FullToken.FullImmediate; +			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +			assert(imm->Immediate.NrTokens == 4 + 1); +			vpc->imm[vpc->nr_imm++] = +				constant(vpc, -1, +					 imm->u.ImmediateFloat32[0].Float, +					 imm->u.ImmediateFloat32[1].Float, +					 imm->u.ImmediateFloat32[2].Float, +					 imm->u.ImmediateFloat32[3].Float); +		} +			break; +		case TGSI_TOKEN_TYPE_INSTRUCTION: +		{ +			const struct tgsi_full_instruction *finst; +			finst = &parse.FullToken.FullInstruction; +			if (!nv30_vertprog_parse_instruction(vpc, 
finst)) +				goto out_err; +		} +			break; +		default: +			break; +		} +	} + +	vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; +	vp->translated = TRUE; +out_err: +	tgsi_parse_free(&parse); +	FREE(vpc); +} + +static boolean +nv30_vertprog_validate(struct nv30_context *nv30) +{  +	struct nouveau_winsys *nvws = nv30->nvws; +	struct pipe_winsys *ws = nv30->pipe.winsys; +	struct nouveau_grobj *rankine = nv30->screen->rankine; +	struct nv30_vertex_program *vp; +	struct pipe_buffer *constbuf; +	boolean upload_code = FALSE, upload_data = FALSE; +	int i; + +	vp = nv30->vertprog; +	constbuf = nv30->constbuf[PIPE_SHADER_VERTEX]; + +	/* Translate TGSI shader into hw bytecode */ +	if (!vp->translated) { +		nv30_vertprog_translate(nv30, vp); +		if (!vp->translated) +			return FALSE; +	} + +	/* Allocate hw vtxprog exec slots */ +	if (!vp->exec) { +		struct nouveau_resource *heap = nv30->screen->vp_exec_heap; +		struct nouveau_stateobj *so; +		uint vplen = vp->nr_insns; + +		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { +			while (heap->next && heap->size < vplen) { +				struct nv30_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->exec); +			} + +			if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) +				assert(0); +		} + +		so = so_new(2, 0); +		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); +		so_data  (so, vp->exec->start); +		so_ref(so, &vp->so); + +		upload_code = TRUE; +	} + +	/* Allocate hw vtxprog const slots */ +	if (vp->nr_consts && !vp->data) { +		struct nouveau_resource *heap = nv30->screen->vp_data_heap; + +		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { +			while (heap->next && heap->size < vp->nr_consts) { +				struct nv30_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->data); +			} + +			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) +				assert(0); +		} + +		/*XXX: handle this some day */ +		assert(vp->data->start >= vp->data_start_min); + +		upload_data = TRUE; +		if (vp->data_start != vp->data->start) +			upload_code = TRUE; +	} + +	/* If exec or data segments moved we need to patch the program to +	 * fixup offsets and register IDs. 
+	 */ +	if (vp->exec_start != vp->exec->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->has_branch_offset) { +				assert(0); +			} +		} + +		vp->exec_start = vp->exec->start; +	} + +	if (vp->nr_consts && vp->data_start != vp->data->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->const_index >= 0) { +				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; +				vpi->data[1] |= +					(vpi->const_index + vp->data->start) << +					NV30_VP_INST_CONST_SRC_SHIFT; + +			} +		} + +		vp->data_start = vp->data->start; +	} + +	/* Update + Upload constant values */ +	if (vp->nr_consts) { +		float *map = NULL; + +		if (constbuf) { +			map = ws->buffer_map(ws, constbuf, +					     PIPE_BUFFER_USAGE_CPU_READ); +		} + +		for (i = 0; i < vp->nr_consts; i++) { +			struct nv30_vertex_program_data *vpd = &vp->consts[i]; + +			if (vpd->index >= 0) { +				if (!upload_data && +				    !memcmp(vpd->value, &map[vpd->index * 4], +					    4 * sizeof(float))) +					continue; +				memcpy(vpd->value, &map[vpd->index * 4], +				       4 * sizeof(float)); +			} + +			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); +			OUT_RING  (i + vp->data->start); +			OUT_RINGp ((uint32_t *)vpd->value, 4); +		} + +		if (constbuf) { +			ws->buffer_unmap(ws, constbuf); +		} +	} + +	/* Upload vtxprog */ +	if (upload_code) { +#if 0 +		for (i = 0; i < vp->nr_insns; i++) { +			NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", +				i, vp->insns[i].data[0], vp->insns[i].data[1], +				vp->insns[i].data[2], vp->insns[i].data[3]); +		} +#endif +		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); +		OUT_RING  (vp->exec->start); +		for (i = 0; i < vp->nr_insns; i++) { +			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); +			OUT_RINGp (vp->insns[i].data, 4); +		} +	} + +	if (vp->so != nv30->state.hw[NV30_STATE_VERTPROG]) { +		so_ref(vp->so, &nv30->state.hw[NV30_STATE_VERTPROG]); +		return TRUE; +	} + +	return FALSE; +} + +void +nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp) +{ +	struct nouveau_winsys *nvws = nv30->screen->nvws; + +	vp->translated = FALSE; + +	if (vp->nr_insns) { +		FREE(vp->insns); +		vp->insns = NULL; +		vp->nr_insns = 0; +	} + +	if (vp->nr_consts) { +		FREE(vp->consts); +		vp->consts = NULL; +		vp->nr_consts = 0; +	} + +	nvws->res_free(&vp->exec); +	vp->exec_start = 0; +	nvws->res_free(&vp->data); +	vp->data_start = 0; +	vp->data_start_min = 0; + +	vp->ir = vp->or = 0; +	so_ref(NULL, &vp->so); +} + +struct nv30_state_entry nv30_state_vertprog = { +	.validate = nv30_vertprog_validate, +	.dirty = { +		.pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, +		.hw = NV30_STATE_VERTPROG, +	} +}; diff --git a/src/gallium/drivers/nv40/Makefile b/src/gallium/drivers/nv40/Makefile new file mode 100644 index 0000000000..9c8eadf7e4 --- /dev/null +++ b/src/gallium/drivers/nv40/Makefile @@ -0,0 +1,37 @@ +TOP = ../../../.. 
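+# Per-driver build glue: the nv40 sources listed below are presumably built
+# into the nv40 driver library through the shared ../../Makefile.template
+# included at the end of this file, mirroring the other Gallium driver
+# Makefiles.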
+include $(TOP)/configs/current + +LIBNAME = nv40 + +DRIVER_SOURCES = \ +	nv40_clear.c \ +	nv40_context.c \ +	nv40_draw.c \ +	nv40_fragprog.c \ +	nv40_fragtex.c \ +	nv40_miptree.c \ +	nv40_query.c \ +	nv40_screen.c \ +	nv40_state.c \ +	nv40_state_blend.c \ +	nv40_state_emit.c \ +	nv40_state_fb.c \ +	nv40_state_rasterizer.c \ +	nv40_state_scissor.c \ +	nv40_state_stipple.c \ +	nv40_state_viewport.c \ +	nv40_state_zsa.c \ +	nv40_surface.c \ +	nv40_vbo.c \ +	nv40_vertprog.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv40/nv40_clear.c b/src/gallium/drivers/nv40/nv40_clear.c new file mode 100644 index 0000000000..59efd620e3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" + +void +nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +	ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c new file mode 100644 index 0000000000..5d325f5067 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +static void +nv40_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	 +	if (flags & PIPE_FLUSH_TEXTURE_CACHE) { +		BEGIN_RING(curie, 0x1fd8, 1); +		OUT_RING  (2); +		BEGIN_RING(curie, 0x1fd8, 1); +		OUT_RING  (1); +	} + +	FIRE_RING(fence); +} + +static void +nv40_destroy(struct pipe_context *pipe) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	if (nv40->draw) +		draw_destroy(nv40->draw); +	FREE(nv40); +} + +struct pipe_context * +nv40_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct nv40_screen *screen = nv40_screen(pscreen); +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv40_context *nv40; +	struct nouveau_winsys *nvws = screen->nvws; + +	nv40 = CALLOC(1, sizeof(struct nv40_context)); +	if (!nv40) +		return NULL; +	nv40->screen = screen; +	nv40->pctx_id = pctx_id; + +	nv40->nvws = nvws; + +	nv40->pipe.winsys = ws; +	nv40->pipe.screen = pscreen; +	nv40->pipe.destroy = nv40_destroy; +	nv40->pipe.draw_arrays = nv40_draw_arrays; +	nv40->pipe.draw_elements = nv40_draw_elements; +	nv40->pipe.clear = nv40_clear; +	nv40->pipe.flush = nv40_flush; + +	nv40_init_query_functions(nv40); +	nv40_init_surface_functions(nv40); +	nv40_init_state_functions(nv40); + +	/* Create, configure, and install fallback swtnl path */ +	nv40->draw = draw_create(); +	draw_wide_point_threshold(nv40->draw, 9999999.0); +	draw_wide_line_threshold(nv40->draw, 9999999.0); +	draw_enable_line_stipple(nv40->draw, FALSE); +	draw_enable_point_sprites(nv40->draw, FALSE); +	draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40)); + +	return &nv40->pipe; +} +	 diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h new file mode 100644 index 0000000000..adcfbdd85a --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.h @@ -0,0 +1,233 @@ +#ifndef __NV40_CONTEXT_H__ +#define __NV40_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" 
+#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \ +	struct nv40_screen *ctx = nv40->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv40_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ +	fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv40_state_index { +	NV40_STATE_FB = 0, +	NV40_STATE_VIEWPORT = 1, +	NV40_STATE_BLEND = 2, +	NV40_STATE_RAST = 3, +	NV40_STATE_ZSA = 4, +	NV40_STATE_BCOL = 5, +	NV40_STATE_CLIP = 6, +	NV40_STATE_SCISSOR = 7, +	NV40_STATE_STIPPLE = 8, +	NV40_STATE_FRAGPROG = 9, +	NV40_STATE_VERTPROG = 10, +	NV40_STATE_FRAGTEX0 = 11, +	NV40_STATE_FRAGTEX1 = 12, +	NV40_STATE_FRAGTEX2 = 13, +	NV40_STATE_FRAGTEX3 = 14, +	NV40_STATE_FRAGTEX4 = 15, +	NV40_STATE_FRAGTEX5 = 16, +	NV40_STATE_FRAGTEX6 = 17, +	NV40_STATE_FRAGTEX7 = 18, +	NV40_STATE_FRAGTEX8 = 19, +	NV40_STATE_FRAGTEX9 = 20, +	NV40_STATE_FRAGTEX10 = 21, +	NV40_STATE_FRAGTEX11 = 22, +	NV40_STATE_FRAGTEX12 = 23, +	NV40_STATE_FRAGTEX13 = 24, +	NV40_STATE_FRAGTEX14 = 25, +	NV40_STATE_FRAGTEX15 = 26, +	NV40_STATE_VERTTEX0 = 27, +	NV40_STATE_VERTTEX1 = 28, +	NV40_STATE_VERTTEX2 = 29, +	NV40_STATE_VERTTEX3 = 30, +	NV40_STATE_VTXBUF = 31, +	NV40_STATE_VTXFMT = 32, +	NV40_STATE_VTXATTR = 33, +	NV40_STATE_MAX = 34 +}; + +#include "nv40_screen.h" + +#define NV40_NEW_BLEND		(1 <<  0) +#define NV40_NEW_RAST		(1 <<  1) +#define NV40_NEW_ZSA		(1 <<  2) +#define NV40_NEW_SAMPLER	(1 <<  3) +#define NV40_NEW_FB		(1 <<  4) +#define NV40_NEW_STIPPLE	(1 <<  5) +#define NV40_NEW_SCISSOR	(1 <<  6) +#define NV40_NEW_VIEWPORT	(1 <<  7) +#define NV40_NEW_BCOL		(1 <<  8) +#define NV40_NEW_VERTPROG	(1 <<  9) +#define NV40_NEW_FRAGPROG	(1 << 10) +#define NV40_NEW_ARRAYS		(1 << 11) +#define NV40_NEW_UCP		(1 << 12) + +struct nv40_rasterizer_state { +	struct pipe_rasterizer_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv40_zsa_state { +	struct pipe_depth_stencil_alpha_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv40_blend_state { +	struct pipe_blend_state pipe; +	struct nouveau_stateobj *so; +}; + + +struct nv40_state { +	unsigned scissor_enabled; +	unsigned stipple_enabled; +	unsigned viewport_bypass; +	unsigned fp_samplers; + +	uint64_t dirty; +	struct nouveau_stateobj *hw[NV40_STATE_MAX]; +}; + +struct nv40_context { +	struct pipe_context pipe; + +	struct nouveau_winsys *nvws; +	struct nv40_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	/* HW state derived from pipe states */ +	struct nv40_state state; +	struct { +		struct nv40_vertex_program *vertprog; + +		unsigned nr_attribs; +		unsigned hw[PIPE_MAX_SHADER_INPUTS]; +		unsigned draw[PIPE_MAX_SHADER_INPUTS]; +		unsigned emit[PIPE_MAX_SHADER_INPUTS]; +	} swtnl; + +	enum { +		HW, SWTNL, SWRAST +	} render_mode; +	unsigned fallback_swtnl; +	unsigned fallback_swrast; + +	/* Context state */ +	unsigned dirty, draw_dirty; +	struct pipe_scissor_state scissor; +	unsigned stipple[32]; +	struct pipe_clip_state clip; +	struct nv40_vertex_program *vertprog; +	struct nv40_fragment_program *fragprog; +	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; +	unsigned constbuf_nr[PIPE_SHADER_TYPES]; +	struct nv40_rasterizer_state *rasterizer; +	struct nv40_zsa_state *zsa; +	
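+	/* The bound CSO state here (rasterizer, zsa, blend) pairs the pipe
+	 * state with a prebuilt nouveau_stateobj, presumably so validation
+	 * only has to re-reference the canned hardware commands; which
+	 * entries need revalidating is tracked by the NV40_NEW_* bits in
+	 * 'dirty' above.
+	 */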
struct nv40_blend_state *blend; +	struct pipe_blend_color blend_colour; +	struct pipe_viewport_state viewport; +	struct pipe_framebuffer_state framebuffer; +	struct pipe_buffer *idxbuf; +	unsigned idxbuf_format; +	struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; +	struct nv40_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; +	unsigned nr_samplers; +	unsigned nr_textures; +	unsigned dirty_samplers; +	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; +	unsigned vtxbuf_nr; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +	unsigned vtxelt_nr; +	const unsigned *edgeflags; +}; + +static INLINE struct nv40_context * +nv40_context(struct pipe_context *pipe) +{ +	return (struct nv40_context *)pipe; +} + +struct nv40_state_entry { +	boolean (*validate)(struct nv40_context *nv40); +	struct { +		unsigned pipe; +		unsigned hw; +	} dirty; +}; + +extern void nv40_init_state_functions(struct nv40_context *nv40); +extern void nv40_init_surface_functions(struct nv40_context *nv40); +extern void nv40_init_query_functions(struct nv40_context *nv40); + +extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv40_draw.c */ +extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40); +extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe, +					struct pipe_buffer *idxbuf, +					unsigned ib_size, unsigned mode, +					unsigned start, unsigned count); + +/* nv40_vertprog.c */ +extern void nv40_vertprog_destroy(struct nv40_context *, +				  struct nv40_vertex_program *); + +/* nv40_fragprog.c */ +extern void nv40_fragprog_destroy(struct nv40_context *, +				  struct nv40_fragment_program *); + +/* nv40_fragtex.c */ +extern void nv40_fragtex_bind(struct nv40_context *); + +/* nv40_state.c and friends */ +extern boolean nv40_state_validate(struct nv40_context *nv40); +extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40); +extern void nv40_state_emit(struct nv40_context *nv40); +extern struct nv40_state_entry nv40_state_rasterizer; +extern struct nv40_state_entry nv40_state_scissor; +extern struct nv40_state_entry nv40_state_stipple; +extern struct nv40_state_entry nv40_state_fragprog; +extern struct nv40_state_entry nv40_state_vertprog; +extern struct nv40_state_entry nv40_state_blend; +extern struct nv40_state_entry nv40_state_blend_colour; +extern struct nv40_state_entry nv40_state_zsa; +extern struct nv40_state_entry nv40_state_viewport; +extern struct nv40_state_entry nv40_state_framebuffer; +extern struct nv40_state_entry nv40_state_fragtex; +extern struct nv40_state_entry nv40_state_vbo; +extern struct nv40_state_entry nv40_state_vtxfmt; + +/* nv40_vbo.c */ +extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv40_draw_elements(struct pipe_context *pipe, +				  struct pipe_buffer *indexBuffer, +				  unsigned indexSize, +				  unsigned mode, unsigned start, +				  unsigned count); + +/* nv40_clear.c */ +extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c new file mode 100644 index 0000000000..c83ff91d7e --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_draw.c @@ -0,0 +1,349 @@ +#include "pipe/p_shader_tokens.h" + +#include "util/u_pack_color.h" + +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pipe.h" + +#include "nv40_context.h" +#define NV40_SHADER_NO_FUCKEDNESS +#include 
"nv40_shader.h" + +/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very + * often at all.  Uses "quadro style" vertex submission + a fixed vertex + * layout to avoid the need to generate a vertex program or vtxfmt. + */ + +struct nv40_render_stage { +	struct draw_stage stage; +	struct nv40_context *nv40; +	unsigned prim; +}; + +static INLINE struct nv40_render_stage * +nv40_render_stage(struct draw_stage *stage) +{ +	return (struct nv40_render_stage *)stage; +} + +static INLINE void +nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v) +{ +	unsigned i; + +	for (i = 0; i < nv40->swtnl.nr_attribs; i++) { +		unsigned idx = nv40->swtnl.draw[i]; +		unsigned hw = nv40->swtnl.hw[i]; + +		switch (nv40->swtnl.emit[i]) { +		case EMIT_OMIT: +			break; +		case EMIT_1F: +			BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1); +			OUT_RING  (fui(v->data[idx][0])); +			break; +		case EMIT_2F: +			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2); +			OUT_RING  (fui(v->data[idx][0])); +			OUT_RING  (fui(v->data[idx][1])); +			break; +		case EMIT_3F: +			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3); +			OUT_RING  (fui(v->data[idx][0])); +			OUT_RING  (fui(v->data[idx][1])); +			OUT_RING  (fui(v->data[idx][2])); +			break; +		case EMIT_4F: +			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4); +			OUT_RING  (fui(v->data[idx][0])); +			OUT_RING  (fui(v->data[idx][1])); +			OUT_RING  (fui(v->data[idx][2])); +			OUT_RING  (fui(v->data[idx][3])); +			break; +		case EMIT_4UB: +			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1); +			OUT_RING  (pack_ub4(float_to_ubyte(v->data[idx][0]), +					    float_to_ubyte(v->data[idx][1]), +					    float_to_ubyte(v->data[idx][2]), +					    float_to_ubyte(v->data[idx][3]))); +			break; +		default: +			assert(0); +			break; +		} +	} +} + +static INLINE void +nv40_render_prim(struct draw_stage *stage, struct prim_header *prim, +	       unsigned mode, unsigned count) +{ +	struct nv40_render_stage *rs = nv40_render_stage(stage); +	struct nv40_context *nv40 = rs->nv40; +	struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf; +	unsigned i; + +	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */ +	if (pb->remaining < ((count * 20) + 6)) { +		if (rs->prim != NV40TCL_BEGIN_END_STOP) { +			NOUVEAU_ERR("AIII, missed flush\n"); +			assert(0); +		} +		FIRE_RING(NULL); +		nv40_state_emit(nv40); +	} + +	/* Switch primitive modes if necessary */ +	if (rs->prim != mode) { +		if (rs->prim != NV40TCL_BEGIN_END_STOP) { +			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +			OUT_RING  (NV40TCL_BEGIN_END_STOP);	 +		} + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (mode); +		rs->prim = mode; +	} + +	/* Emit vertex data */ +	for (i = 0; i < count; i++) +		nv40_render_vertex(nv40, prim->v[i]); + +	/* If it's likely we'll need to empty the push buffer soon, finish +	 * off the primitive now. 
+	 */ +	if (pb->remaining < ((count * 20) + 6)) { +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (NV40TCL_BEGIN_END_STOP); +		rs->prim = NV40TCL_BEGIN_END_STOP; +	} +} + +static void +nv40_render_point(struct draw_stage *draw, struct prim_header *prim) +{ +	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1); +} + +static void +nv40_render_line(struct draw_stage *draw, struct prim_header *prim) +{ +	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2); +} + +static void +nv40_render_tri(struct draw_stage *draw, struct prim_header *prim) +{ +	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3); +} + +static void +nv40_render_flush(struct draw_stage *draw, unsigned flags) +{ +	struct nv40_render_stage *rs = nv40_render_stage(draw); +	struct nv40_context *nv40 = rs->nv40; + +	if (rs->prim != NV40TCL_BEGIN_END_STOP) { +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (NV40TCL_BEGIN_END_STOP); +		rs->prim = NV40TCL_BEGIN_END_STOP; +	} +} + +static void +nv40_render_reset_stipple_counter(struct draw_stage *draw) +{ +} + +static void +nv40_render_destroy(struct draw_stage *draw) +{ +	FREE(draw); +} + +static INLINE void +emit_mov(struct nv40_vertex_program *vp, +	 unsigned dst, unsigned src, unsigned vor, unsigned mask) +{ +	struct nv40_vertex_program_exec *inst; + +	vp->insns = realloc(vp->insns, +			    sizeof(struct nv40_vertex_program_exec) * +			    ++vp->nr_insns); +	inst = &vp->insns[vp->nr_insns - 1]; + +	inst->data[0] = 0x401f9c6c; +	inst->data[1] = 0x0040000d | (src << 8); +	inst->data[2] = 0x8106c083; +	inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13); +	inst->const_index = -1; +	inst->has_branch_offset = FALSE; + +	vp->ir |= (1 << src); +	if (vor != ~0) +		vp->or |= (1 << vor); +} + +static struct nv40_vertex_program * +create_drawvp(struct nv40_context *nv40) +{ +	struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program); +	unsigned i; + +	emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf); +	emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf); +	emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf); +	emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf); +	emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf); +	emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8); +	for (i = 0; i < 8; i++) +		emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf); + +	vp->insns[vp->nr_insns - 1].data[3] |= 1; +	vp->translated = TRUE; +	return vp; +} + +struct draw_stage * +nv40_draw_render_stage(struct nv40_context *nv40) +{ +	struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage); + +	if (!nv40->swtnl.vertprog) +		nv40->swtnl.vertprog = create_drawvp(nv40); + +	render->nv40 = nv40; +	render->stage.draw = nv40->draw; +	render->stage.point = nv40_render_point; +	render->stage.line = nv40_render_line; +	render->stage.tri = nv40_render_tri; +	render->stage.flush = nv40_render_flush; +	render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter; +	render->stage.destroy = nv40_render_destroy; + +	return &render->stage; +} + +boolean +nv40_draw_elements_swtnl(struct pipe_context *pipe, +			 struct pipe_buffer *idxbuf, unsigned idxbuf_size, +			 unsigned mode, unsigned start, unsigned count) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct pipe_winsys *ws = pipe->winsys; +	unsigned i; +	void *map; + +	if (!nv40_state_validate_swtnl(nv40)) +		return FALSE; +	nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF); +	nv40_state_emit(nv40); + +	for (i = 0; i < nv40->vtxbuf_nr; i++) { +		map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer, +                
                      PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_vertex_buffer(nv40->draw, i, map); +	} + +	if (idxbuf) { +		map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map); +	} else { +		draw_set_mapped_element_buffer(nv40->draw, 0, NULL); +	} + +	if (nv40->constbuf[PIPE_SHADER_VERTEX]) { +		const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX]; + +		map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX], +				     PIPE_BUFFER_USAGE_CPU_READ); +		draw_set_mapped_constant_buffer(nv40->draw, map, nr); +	} + +	draw_arrays(nv40->draw, mode, start, count); + +	for (i = 0; i < nv40->vtxbuf_nr; i++) +		ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer); + +	if (idxbuf) +		ws->buffer_unmap(ws, idxbuf); + +	if (nv40->constbuf[PIPE_SHADER_VERTEX]) +		ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]); + +	draw_flush(nv40->draw); +	pipe->flush(pipe, 0, NULL); + +	return TRUE; +} + +static INLINE void +emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit, +	    unsigned semantic, unsigned index) +{ +	unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index); +	unsigned a = nv40->swtnl.nr_attribs++; + +	nv40->swtnl.hw[a] = hw; +	nv40->swtnl.emit[a] = emit; +	nv40->swtnl.draw[a] = draw_out; +} + +static boolean +nv40_state_vtxfmt_validate(struct nv40_context *nv40) +{ +	struct nv40_fragment_program *fp = nv40->fragprog; +	unsigned colour = 0, texcoords = 0, fog = 0, i; + +	/* Determine needed fragprog inputs */ +	for (i = 0; i < fp->info.num_inputs; i++) { +		switch (fp->info.input_semantic_name[i]) { +		case TGSI_SEMANTIC_POSITION: +			break; +		case TGSI_SEMANTIC_COLOR: +			colour |= (1 << fp->info.input_semantic_index[i]); +			break; +		case TGSI_SEMANTIC_GENERIC: +			texcoords |= (1 << fp->info.input_semantic_index[i]); +			break; +		case TGSI_SEMANTIC_FOG: +			fog = 1; +			break; +		default: +			assert(0); +		} +	} + +	nv40->swtnl.nr_attribs = 0; + +	/* Map draw vtxprog output to hw attribute IDs */ +	for (i = 0; i < 2; i++) { +		if (!(colour & (1 << i))) +			continue; +		emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i); +	} + +	for (i = 0; i < 8; i++) { +		if (!(texcoords & (1 << i))) +			continue; +		emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i); +	} + +	if (fog) { +		emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0); +	} + +	emit_attrib(nv40, 0, EMIT_3F, TGSI_SEMANTIC_POSITION, 0); + +	return FALSE; +} + +struct nv40_state_entry nv40_state_vtxfmt = { +	.validate = nv40_state_vtxfmt_validate, +	.dirty = { +		.pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG, +		.hw = 0 +	} +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c new file mode 100644 index 0000000000..91dcbebda0 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragprog.c @@ -0,0 +1,991 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV40_FP_OP_COND_TR +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) +#define scale(s,v) 
nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv40_fpc { +	struct nv40_fragment_program *fp; + +	uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + +	unsigned r_temps; +	unsigned r_temps_discard; +	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; +	struct nv40_sreg *r_temp; + +	int num_regs; + +	unsigned inst_offset; +	unsigned have_const; + +	struct { +		int pipe; +		float vals[4]; +	} consts[MAX_CONSTS]; +	int nr_consts; + +	struct nv40_sreg imm[MAX_IMM]; +	unsigned nr_imm; +}; + +static INLINE struct nv40_sreg +temp(struct nv40_fpc *fpc) +{ +	int idx = ffs(~fpc->r_temps) - 1; + +	if (idx < 0) { +		NOUVEAU_ERR("out of temps!!\n"); +		assert(0); +		return nv40_sr(NV40SR_TEMP, 0); +	} + +	fpc->r_temps |= (1 << idx); +	fpc->r_temps_discard |= (1 << idx); +	return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_fpc *fpc) +{ +	fpc->r_temps &= ~fpc->r_temps_discard; +	fpc->r_temps_discard = 0; +} + +static INLINE struct nv40_sreg +constant(struct nv40_fpc *fpc, int pipe, float vals[4]) +{ +	int idx; + +	if (fpc->nr_consts == MAX_CONSTS) +		assert(0); +	idx = fpc->nr_consts++; + +	fpc->consts[idx].pipe = pipe; +	if (pipe == -1) +		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); +	return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ +	nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \ +			(d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ +	nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \ +		    (d), (m), (s0), none, none) + +static void +grow_insns(struct nv40_fpc *fpc, int size) +{ +	struct nv40_fragment_program *fp = fpc->fp; + +	fp->insn_len += size; +	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src) +{ +	struct nv40_fragment_program *fp = fpc->fp; +	uint32_t *hw = &fp->insn[fpc->inst_offset]; +	uint32_t sr = 0; + +	switch (src.type) { +	case NV40SR_INPUT: +		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); +		hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT); +		break; +	case NV40SR_OUTPUT: +		sr |= NV40_FP_REG_SRC_HALF; +		/* fall-through */ +	case NV40SR_TEMP: +		sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT); +		sr |= (src.index << NV40_FP_REG_SRC_SHIFT); +		break; +	case NV40SR_CONST: +		if (!fpc->have_const) { +			grow_insns(fpc, 4); +			fpc->have_const = 1; +		} + +		hw = &fp->insn[fpc->inst_offset]; +		if (fpc->consts[src.index].pipe >= 0) { +			struct nv40_fragment_program_data *fpd; + +			fp->consts = realloc(fp->consts, ++fp->nr_consts * +					     sizeof(*fpd)); +			fpd = &fp->consts[fp->nr_consts - 1]; +			fpd->offset = fpc->inst_offset + 4; +			fpd->index = fpc->consts[src.index].pipe; +			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); +		} else { +			memcpy(&fp->insn[fpc->inst_offset + 4], +				fpc->consts[src.index].vals, +				sizeof(uint32_t) * 4); +		} + +		sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);	 +		break; +	case NV40SR_NONE: +		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); +		break; +	default: +		assert(0); +	} + +	if (src.negate) +		sr |= NV40_FP_REG_NEGATE; + +	if (src.abs) +		hw[1] |= (1 << (29 + pos)); + +	sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) | +	       (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) | +	       (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) | +	       (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT)); + +	hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv40_fpc *fpc, struct 
nv40_sreg dst) +{ +	struct nv40_fragment_program *fp = fpc->fp; +	uint32_t *hw = &fp->insn[fpc->inst_offset]; + +	switch (dst.type) { +	case NV40SR_TEMP: +		if (fpc->num_regs < (dst.index + 1)) +			fpc->num_regs = dst.index + 1; +		break; +	case NV40SR_OUTPUT: +		if (dst.index == 1) { +			fp->fp_control |= 0xe; +		} else { +			hw[0] |= NV40_FP_OP_OUT_REG_HALF; +		} +		break; +	case NV40SR_NONE: +		hw[0] |= (1 << 30); +		break; +	default: +		assert(0); +	} + +	hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT); +} + +static void +nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op, +	      struct nv40_sreg dst, int mask, +	      struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ +	struct nv40_fragment_program *fp = fpc->fp; +	uint32_t *hw; + +	fpc->inst_offset = fp->insn_len; +	fpc->have_const = 0; +	grow_insns(fpc, 4); +	hw = &fp->insn[fpc->inst_offset]; +	memset(hw, 0, sizeof(uint32_t) * 4); + +	if (op == NV40_FP_OP_OPCODE_KIL) +		fp->fp_control |= NV40TCL_FP_CONTROL_KIL; +	hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT); +	hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT); +	hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT); + +	if (sat) +		hw[0] |= NV40_FP_OP_OUT_SAT; + +	if (dst.cc_update) +		hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE; +	hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT); +	hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) | +		  (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) | +		  (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) | +		  (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT)); + +	emit_dst(fpc, dst); +	emit_src(fpc, 0, s0); +	emit_src(fpc, 1, s1); +	emit_src(fpc, 2, s2); +} + +static void +nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit, +	    struct nv40_sreg dst, int mask, +	    struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ +	struct nv40_fragment_program *fp = fpc->fp; + +	nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + +	fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT); +	fp->samplers |= (1 << unit); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ +	struct nv40_sreg src; + +	switch (fsrc->SrcRegister.File) { +	case TGSI_FILE_INPUT: +		src = nv40_sr(NV40SR_INPUT, +			      fpc->attrib_map[fsrc->SrcRegister.Index]); +		break; +	case TGSI_FILE_CONSTANT: +		src = constant(fpc, fsrc->SrcRegister.Index, NULL); +		break; +	case TGSI_FILE_IMMEDIATE: +		assert(fsrc->SrcRegister.Index < fpc->nr_imm); +		src = fpc->imm[fsrc->SrcRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		src = fpc->r_temp[fsrc->SrcRegister.Index]; +		break; +	/* NV40 fragprog result regs are just temps, so this is simple */ +	case TGSI_FILE_OUTPUT: +		src = fpc->r_result[fsrc->SrcRegister.Index]; +		break; +	default: +		NOUVEAU_ERR("bad src file\n"); +		break; +	} + +	src.abs = fsrc->SrcRegisterExtMod.Absolute; +	src.negate = fsrc->SrcRegister.Negate; +	src.swz[0] = fsrc->SrcRegister.SwizzleX; +	src.swz[1] = fsrc->SrcRegister.SwizzleY; +	src.swz[2] = fsrc->SrcRegister.SwizzleZ; +	src.swz[3] = fsrc->SrcRegister.SwizzleW; +	return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) { +	switch (fdst->DstRegister.File) { +	case TGSI_FILE_OUTPUT: +		return fpc->r_result[fdst->DstRegister.Index]; +	case TGSI_FILE_TEMPORARY: +		return fpc->r_temp[fdst->DstRegister.Index]; +	case TGSI_FILE_NULL: +		return nv40_sr(NV40SR_NONE, 0); +	default: +		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); +		return 
nv40_sr(NV40SR_NONE, 0); +	} +} + +static INLINE int +tgsi_mask(uint tgsi) +{ +	int mask = 0; + +	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; +	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; +	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; +	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; +	return mask; +} + +static boolean +src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc, +	       struct nv40_sreg *src) +{ +	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); +	struct nv40_sreg tgsi = tgsi_src(fpc, fsrc); +	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; +	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, +			fsrc->SrcRegisterExtSwz.NegateY, +			fsrc->SrcRegisterExtSwz.NegateZ, +			fsrc->SrcRegisterExtSwz.NegateW }; +	uint c; + +	for (c = 0; c < 4; c++) { +		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { +		case TGSI_EXTSWIZZLE_X: +		case TGSI_EXTSWIZZLE_Y: +		case TGSI_EXTSWIZZLE_Z: +		case TGSI_EXTSWIZZLE_W: +			mask |= (1 << c); +			break; +		case TGSI_EXTSWIZZLE_ZERO: +			zero_mask |= (1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		case TGSI_EXTSWIZZLE_ONE: +			one_mask |= (1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		default: +			assert(0); +		} + +		if (!tgsi.negate && neg[c]) +			neg_mask |= (1 << c); +	} + +	if (mask == MASK_ALL && !neg_mask) +		return TRUE; + +	*src = temp(fpc); + +	if (mask) +		arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + +	if (zero_mask) +		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + +	if (one_mask) +		arith(fpc, 0, STR, *src, one_mask, *src, none, none); + +	if (neg_mask) { +		struct nv40_sreg one = temp(fpc); +		arith(fpc, 0, STR, one, neg_mask, one, none, none); +		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); +	} + +	return FALSE; +} + +static boolean +nv40_fragprog_parse_instruction(struct nv40_fpc *fpc, +				const struct tgsi_full_instruction *finst) +{ +	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); +	struct nv40_sreg src[3], dst, tmp; +	int mask, sat, unit; +	int ai = -1, ci = -1, ii = -1; +	int i; + +	if (finst->Instruction.Opcode == TGSI_OPCODE_END) +		return TRUE; + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { +			src[i] = tgsi_src(fpc, fsrc); +		} +	} + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_TEMPORARY: +			if (!src_native_swz(fpc, fsrc, &src[i])) +				continue; +			break; +		default: +			break; +		} + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			if (ai == -1 || ai == fsrc->SrcRegister.Index) { +				ai = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(fpc, fsrc); +			} else { +				src[i] = temp(fpc); +				arith(fpc, 0, MOV, src[i], MASK_ALL, +				      tgsi_src(fpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_CONSTANT: +			if ((ci == -1 && ii == -1) || +			    ci == fsrc->SrcRegister.Index) { +				ci = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(fpc, fsrc); +			} else { +				src[i] = temp(fpc); +				arith(fpc, 0, MOV, src[i], MASK_ALL, +				      tgsi_src(fpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_IMMEDIATE: +			if ((ci == -1 && ii == -1) || +			    ii == fsrc->SrcRegister.Index) { +				ii = fsrc->SrcRegister.Index; +				src[i] = 
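+				/* As with inputs and constants above, only one
+				 * distinct immediate appears to be readable per
+				 * instruction (the value ends up in the extra
+				 * const words emit_src() appends), so any
+				 * further immediates are MOVed into temps.
+				 */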
tgsi_src(fpc, fsrc); +			} else { +				src[i] = temp(fpc); +				arith(fpc, 0, MOV, src[i], MASK_ALL, +				      tgsi_src(fpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_TEMPORARY: +			/* handled above */ +			break; +		case TGSI_FILE_SAMPLER: +			unit = fsrc->SrcRegister.Index; +			break; +		case TGSI_FILE_OUTPUT: +			break; +		default: +			NOUVEAU_ERR("bad src file\n"); +			return FALSE; +		} +	} + +	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]); +	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); +	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + +	switch (finst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); +		break; +	case TGSI_OPCODE_ADD: +		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_CMP: +		tmp = temp(fpc); +		arith(fpc, sat, MOV, dst, mask, src[2], none, none); +		tmp.cc_update = 1; +		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); +		dst.cc_test = NV40_VP_INST_COND_LT; +		arith(fpc, sat, MOV, dst, mask, src[1], none, none); +		break; +	case TGSI_OPCODE_COS: +		arith(fpc, sat, COS, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_DDX: +		if (mask & (MASK_Z | MASK_W)) { +			tmp = temp(fpc); +			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, +			      swz(src[0], Z, W, Z, W), none, none); +			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, +			      swz(tmp, X, Y, X, Y), none, none); +			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0], +			      none, none); +			arith(fpc, 0, MOV, dst, mask, tmp, none, none); +		} else { +			arith(fpc, sat, DDX, dst, mask, src[0], none, none); +		} +		break; +	case TGSI_OPCODE_DDY: +		if (mask & (MASK_Z | MASK_W)) { +			tmp = temp(fpc); +			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, +			      swz(src[0], Z, W, Z, W), none, none); +			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, +			      swz(tmp, X, Y, X, Y), none, none); +			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0], +			      none, none); +			arith(fpc, 0, MOV, dst, mask, tmp, none, none); +		} else { +			arith(fpc, sat, DDY, dst, mask, src[0], none, none); +		} +		break; +	case TGSI_OPCODE_DP3: +		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DP4: +		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DPH: +		tmp = temp(fpc); +		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); +		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), +		      swz(src[1], W, W, W, W), none); +		break; +	case TGSI_OPCODE_DST: +		arith(fpc, sat, DST, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_EX2: +		arith(fpc, sat, EX2, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FLR: +		arith(fpc, sat, FLR, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FRC: +		arith(fpc, sat, FRC, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_KILP: +		arith(fpc, 0, KIL, none, 0, none, none, none); +		break; +	case TGSI_OPCODE_KIL: +		dst = nv40_sr(NV40SR_NONE, 0); +		dst.cc_update = 1; +		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); +		dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT; +		arith(fpc, 0, KIL, dst, 0, none, none, none); +		break; +	case TGSI_OPCODE_LG2: +		arith(fpc, sat, LG2, dst, mask, src[0], none, none); +		break; +//	case TGSI_OPCODE_LIT: +	case TGSI_OPCODE_LRP: +		tmp = temp(fpc); +		arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]); +		arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp); +		break; +	case 
TGSI_OPCODE_MAD: +		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); +		break; +	case TGSI_OPCODE_MAX: +		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MIN: +		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MOV: +		arith(fpc, sat, MOV, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_MUL: +		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_NOISE1: +	case TGSI_OPCODE_NOISE2: +	case TGSI_OPCODE_NOISE3: +	case TGSI_OPCODE_NOISE4: +		arith(fpc, sat, SFL, dst, mask, none, none, none); +		break; +	case TGSI_OPCODE_POW: +		tmp = temp(fpc); +		arith(fpc, 0, LG2, tmp, MASK_X, +		      swz(src[0], X, X, X, X), none, none); +		arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X), +		      swz(src[1], X, X, X, X), none); +		arith(fpc, sat, EX2, dst, mask, +		      swz(tmp, X, X, X, X), none, none); +		break; +	case TGSI_OPCODE_RCP: +		arith(fpc, sat, RCP, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_RET: +		assert(0); +		break; +	case TGSI_OPCODE_RFL: +		tmp = temp(fpc); +		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none); +		arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none); +		arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z, +		      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none); +		arith(fpc, sat, MAD, dst, mask, +		      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])); +		break; +	case TGSI_OPCODE_RSQ: +		tmp = temp(fpc); +		arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X, +		      abs(swz(src[0], X, X, X, X)), none, none); +		arith(fpc, sat, EX2, dst, mask, +		      neg(swz(tmp, X, X, X, X)), none, none); +		break; +	case TGSI_OPCODE_SCS: +		if (mask & MASK_X) { +			arith(fpc, sat, COS, dst, MASK_X, +			      swz(src[0], X, X, X, X), none, none); +		} +		if (mask & MASK_Y) { +			arith(fpc, sat, SIN, dst, MASK_Y, +			      swz(src[0], X, X, X, X), none, none); +		} +		break; +	case TGSI_OPCODE_SEQ: +		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SFL: +		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SGE: +		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SGT: +		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SIN: +		arith(fpc, sat, SIN, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_SLE: +		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SLT: +		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SNE: +		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_STR: +		arith(fpc, sat, STR, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SUB: +		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); +		break; +	case TGSI_OPCODE_TEX: +		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_TXB: +		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_TXP: +		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_XPD: +		tmp = temp(fpc); +		arith(fpc, 0, MUL, tmp, mask, +		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); +		arith(fpc, sat, MAD, dst, (mask & ~MASK_W), +		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), +		      neg(tmp)); +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); +		return FALSE; +	} + +	release_temps(fpc); +	
return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc, +				const struct tgsi_full_declaration *fdec) +{ +	int hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = NV40_FP_OP_INPUT_SRC_POSITION; +		break; +	case TGSI_SEMANTIC_COLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV40_FP_OP_INPUT_SRC_COL0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV40_FP_OP_INPUT_SRC_COL1; +		} else { +			NOUVEAU_ERR("bad colour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_FOG: +		hw = NV40_FP_OP_INPUT_SRC_FOGC; +		break; +	case TGSI_SEMANTIC_GENERIC: +		if (fdec->Semantic.SemanticIndex <= 7) { +			hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic. +						     SemanticIndex); +		} else { +			NOUVEAU_ERR("bad generic semantic index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad input semantic\n"); +		return FALSE; +	} + +	fpc->attrib_map[fdec->DeclarationRange.First] = hw; +	return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc, +				const struct tgsi_full_declaration *fdec) +{ +	unsigned idx = fdec->DeclarationRange.First; +	unsigned hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = 1; +		break; +	case TGSI_SEMANTIC_COLOR: +		switch (fdec->Semantic.SemanticIndex) { +		case 0: hw = 0; break; +		case 1: hw = 2; break; +		case 2: hw = 3; break; +		case 3: hw = 4; break; +		default: +			NOUVEAU_ERR("bad rcol index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad output semantic\n"); +		return FALSE; +	} + +	fpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); +	fpc->r_temps |= (1 << hw); +	return TRUE; +} + +static boolean +nv40_fragprog_prepare(struct nv40_fpc *fpc) +{ +	struct tgsi_parse_context p; +	int high_temp = -1, i; + +	tgsi_parse_init(&p, fpc->fp->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch(tok->Token.Type) { +		case TGSI_TOKEN_TYPE_DECLARATION: +		{ +			const struct tgsi_full_declaration *fdec; +			fdec = &p.FullToken.FullDeclaration; +			switch (fdec->Declaration.File) { +			case TGSI_FILE_INPUT: +				if (!nv40_fragprog_parse_decl_attrib(fpc, fdec)) +					goto out_err; +				break; +			case TGSI_FILE_OUTPUT: +				if (!nv40_fragprog_parse_decl_output(fpc, fdec)) +					goto out_err; +				break; +			case TGSI_FILE_TEMPORARY: +				if (fdec->DeclarationRange.Last > high_temp) { +					high_temp = +						fdec->DeclarationRange.Last; +				} +				break; +			default: +				break; +			} +		} +			break; +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			struct tgsi_full_immediate *imm; +			float vals[4]; +			 +			imm = &p.FullToken.FullImmediate; +			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +			assert(fpc->nr_imm < MAX_IMM); + +			vals[0] = imm->u.ImmediateFloat32[0].Float; +			vals[1] = imm->u.ImmediateFloat32[1].Float; +			vals[2] = imm->u.ImmediateFloat32[2].Float; +			vals[3] = imm->u.ImmediateFloat32[3].Float; +			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); +		} +			break; +		default: +			break; +		} +	} +	tgsi_parse_free(&p); + +	if (++high_temp) { +		fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg)); +		for (i = 0; i < high_temp; i++) +			fpc->r_temp[i] = temp(fpc); +		fpc->r_temps_discard = 0; +	} + +	return TRUE; + +out_err: +	if (fpc->r_temp) +		FREE(fpc->r_temp); +	tgsi_parse_free(&p); +	return FALSE; +} + +static void +nv40_fragprog_translate(struct 
nv40_context *nv40, +			struct nv40_fragment_program *fp) +{ +	struct tgsi_parse_context parse; +	struct nv40_fpc *fpc = NULL; + +	fpc = CALLOC(1, sizeof(struct nv40_fpc)); +	if (!fpc) +		return; +	fpc->fp = fp; +	fpc->num_regs = 2; + +	if (!nv40_fragprog_prepare(fpc)) { +		FREE(fpc); +		return; +	} + +	tgsi_parse_init(&parse, fp->pipe.tokens); + +	while (!tgsi_parse_end_of_tokens(&parse)) { +		tgsi_parse_token(&parse); + +		switch (parse.FullToken.Token.Type) { +		case TGSI_TOKEN_TYPE_INSTRUCTION: +		{ +			const struct tgsi_full_instruction *finst; + +			finst = &parse.FullToken.FullInstruction; +			if (!nv40_fragprog_parse_instruction(fpc, finst)) +				goto out_err; +		} +			break; +		default: +			break; +		} +	} + +	fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT; + +	/* Terminate final instruction */ +	fp->insn[fpc->inst_offset] |= 0x00000001; + +	/* Append NOP + END instruction, may or may not be necessary. */ +	fpc->inst_offset = fp->insn_len; +	grow_insns(fpc, 4); +	fp->insn[fpc->inst_offset + 0] = 0x00000001; +	fp->insn[fpc->inst_offset + 1] = 0x00000000; +	fp->insn[fpc->inst_offset + 2] = 0x00000000; +	fp->insn[fpc->inst_offset + 3] = 0x00000000; +	 +	fp->translated = TRUE; +out_err: +	tgsi_parse_free(&parse); +	if (fpc->r_temp) +		FREE(fpc->r_temp); +	FREE(fpc); +} + +static void +nv40_fragprog_upload(struct nv40_context *nv40, +		     struct nv40_fragment_program *fp) +{ +	struct pipe_winsys *ws = nv40->pipe.winsys; +	const uint32_t le = 1; +	uint32_t *map; +	int i; + +	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 +	for (i = 0; i < fp->insn_len; i++) { +		fflush(stdout); fflush(stderr); +		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); +		fflush(stdout); fflush(stderr); +	} +#endif + +	if ((*(const uint8_t *)&le)) { +		for (i = 0; i < fp->insn_len; i++) { +			map[i] = fp->insn[i]; +		} +	} else { +		/* Weird swapping for big-endian chips */ +		for (i = 0; i < fp->insn_len; i++) { +			map[i] = ((fp->insn[i] & 0xffff) << 16) | +				  ((fp->insn[i] >> 16) & 0xffff); +		} +	} + +	ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv40_fragprog_validate(struct nv40_context *nv40) +{ +	struct nv40_fragment_program *fp = nv40->fragprog; +	struct pipe_buffer *constbuf = +		nv40->constbuf[PIPE_SHADER_FRAGMENT]; +	struct pipe_winsys *ws = nv40->pipe.winsys; +	struct nouveau_stateobj *so; +	boolean new_consts = FALSE; +	int i; + +	if (fp->translated) +		goto update_constants; + +	nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG; +	nv40_fragprog_translate(nv40, fp); +	if (!fp->translated) { +		nv40->fallback_swrast |= NV40_NEW_FRAGPROG; +		return FALSE; +	} + +	fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); +	nv40_fragprog_upload(nv40, fp); + +	so = so_new(4, 1); +	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1); +	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, +		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1); +	so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1); +	so_data  (so, fp->fp_control); +	so_ref(so, &fp->so); + +update_constants: +	if (fp->nr_consts) { +		float *map; +		 +		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); +		for (i = 0; i < fp->nr_consts; i++) { +			struct nv40_fragment_program_data *fpd = &fp->consts[i]; +			uint32_t *p = &fp->insn[fpd->offset]; +			uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + +			if (!memcmp(p, cb, 4 * sizeof(float))) +				continue; +			memcpy(p, cb, 4 * 
sizeof(float)); +			new_consts = TRUE; +		} +		ws->buffer_unmap(ws, constbuf); + +		if (new_consts) +			nv40_fragprog_upload(nv40, fp); +	} + +	if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) { +		so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]); +		return TRUE; +	} + +	return FALSE; +} + +void +nv40_fragprog_destroy(struct nv40_context *nv40, +		      struct nv40_fragment_program *fp) +{ +	if (fp->insn_len) +		FREE(fp->insn); +} + +struct nv40_state_entry nv40_state_fragprog = { +	.validate = nv40_fragprog_validate, +	.dirty = { +		.pipe = NV40_NEW_FRAGPROG, +		.hw = NV40_STATE_FRAGPROG +	} +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c new file mode 100644 index 0000000000..0227d22620 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragtex.c @@ -0,0 +1,168 @@ +#include "nv40_context.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \ +{                                                                              \ +  TRUE,                                                                        \ +  PIPE_FORMAT_##m,                                                             \ +  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \ +  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x | NV40TCL_TEX_SWIZZLE_S0_Y_##ts0y |         \ +   NV40TCL_TEX_SWIZZLE_S0_Z_##ts0z | NV40TCL_TEX_SWIZZLE_S0_W_##ts0w |         \ +   NV40TCL_TEX_SWIZZLE_S1_X_##ts1x | NV40TCL_TEX_SWIZZLE_S1_Y_##ts1y |         \ +   NV40TCL_TEX_SWIZZLE_S1_Z_##ts1z | NV40TCL_TEX_SWIZZLE_S1_W_##ts1w),         \ +  ((NV40TCL_TEX_FILTER_SIGNED_RED*sx) | (NV40TCL_TEX_FILTER_SIGNED_GREEN*sy) |       \ +   (NV40TCL_TEX_FILTER_SIGNED_BLUE*sz) | (NV40TCL_TEX_FILTER_SIGNED_ALPHA*sw))       \ +} + +struct nv40_texture_format { +	boolean defined; +	uint	pipe; +	int     format; +	int     swizzle; +	int     sign; +}; + +static struct nv40_texture_format +nv40_texture_formats[] = { +	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0), +	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0), +	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0), +	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1), +	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0), +	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0), +	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0), +	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0), +	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0), +	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0), +	{}, +}; + +static struct nv40_texture_format * +nv40_fragtex_format(uint pipe_format) +{ +	struct nv40_texture_format *tf = nv40_texture_formats; + +	while (tf->defined) { +		if (tf->pipe == pipe_format) +			return tf; +		tf++; +	} + +	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); +	return NULL; +} + + +static struct nouveau_stateobj * +nv40_fragtex_build(struct 
nv40_context *nv40, int unit) +{ +	struct nv40_sampler_state *ps = nv40->tex_sampler[unit]; +	struct nv40_miptree *nv40mt = nv40->tex_miptree[unit]; +	struct pipe_texture *pt = &nv40mt->base; +	struct nv40_texture_format *tf; +	struct nouveau_stateobj *so; +	uint32_t txf, txs, txp; +	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + +	tf = nv40_fragtex_format(pt->format); +	if (!tf) +		assert(0); + +	txf  = ps->fmt; +	txf |= tf->format | 0x8000; +	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); + +	if (1) /* XXX */ +		txf |= NV40TCL_TEX_FORMAT_NO_BORDER; + +	switch (pt->target) { +	case PIPE_TEXTURE_CUBE: +		txf |= NV40TCL_TEX_FORMAT_CUBIC; +		/* fall-through */ +	case PIPE_TEXTURE_2D: +		txf |= NV40TCL_TEX_FORMAT_DIMS_2D; +		break; +	case PIPE_TEXTURE_3D: +		txf |= NV40TCL_TEX_FORMAT_DIMS_3D; +		break; +	case PIPE_TEXTURE_1D: +		txf |= NV40TCL_TEX_FORMAT_DIMS_1D; +		break; +	default: +		NOUVEAU_ERR("Unknown target %d\n", pt->target); +		return NULL; +	} + +	if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		txp = 0; +	} else { +		txp  = nv40mt->level[0].pitch; +		txf |= NV40TCL_TEX_FORMAT_LINEAR; +	} + +	txs = tf->swizzle; + +	so = so_new(16, 2); +	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8); +	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); +	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, +		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1); +	so_data  (so, ps->wrap); +	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en); +	so_data  (so, txs); +	so_data  (so, ps->filt | tf->sign | 0x2000 /*voodoo*/); +	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) | +		       pt->height[0]); +	so_data  (so, ps->bcol); +	so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1); +	so_data  (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp); + +	return so; +} + +static boolean +nv40_fragtex_validate(struct nv40_context *nv40) +{ +	struct nv40_fragment_program *fp = nv40->fragprog; +	struct nv40_state *state = &nv40->state; +	struct nouveau_stateobj *so; +	unsigned samplers, unit; + +	samplers = state->fp_samplers & ~fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		so = so_new(2, 0); +		so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1); +		so_data  (so, 0); +		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); +		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); +	} + +	samplers = nv40->dirty_samplers & fp->samplers; +	while (samplers) { +		unit = ffs(samplers) - 1; +		samplers &= ~(1 << unit); + +		so = nv40_fragtex_build(nv40, unit); +		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); +		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); +	} + +	nv40->state.fp_samplers = fp->samplers; +	return FALSE; +} + +struct nv40_state_entry nv40_state_fragtex = { +	.validate = nv40_fragtex_validate, +	.dirty = { +		.pipe = NV40_NEW_SAMPLER | NV40_NEW_FRAGPROG, +		.hw = 0 +	} +}; + diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c new file mode 100644 index 0000000000..e4f8df910a --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_miptree.c @@ -0,0 +1,237 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv40_context.h" + +static void +nv40_miptree_layout(struct nv40_miptree *mt) +{ +	struct pipe_texture *pt = &mt->base; +	uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; +	uint 
offset = 0; +	int nr_faces, l, f; +	uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER | +		                           PIPE_TEXTURE_USAGE_DEPTH_STENCIL | +		                           PIPE_TEXTURE_USAGE_RENDER_TARGET | +		                           PIPE_TEXTURE_USAGE_DISPLAY_TARGET | +		                           PIPE_TEXTURE_USAGE_PRIMARY); + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		nr_faces = 6; +	} else +	if (pt->target == PIPE_TEXTURE_3D) { +		nr_faces = pt->depth[0]; +	} else { +		nr_faces = 1; +	} + +	for (l = 0; l <= pt->last_level; l++) { +		pt->width[l] = width; +		pt->height[l] = height; +		pt->depth[l] = depth; +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + +		if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) +			mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64); +		else +			mt->level[l].pitch = pt->width[l] * pt->block.size; + +		mt->level[l].image_offset = +			CALLOC(nr_faces, sizeof(unsigned)); + +		width  = MAX2(1, width  >> 1); +		height = MAX2(1, height >> 1); +		depth  = MAX2(1, depth  >> 1); +	} + +	for (f = 0; f < nr_faces; f++) { +		for (l = 0; l < pt->last_level; l++) { +			mt->level[l].image_offset[f] = offset; + +			if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) +				offset += align(mt->level[l].pitch * pt->height[l], 64); +			else +				offset += mt->level[l].pitch * pt->height[l]; +		} + +		mt->level[l].image_offset[f] = offset; +		offset += mt->level[l].pitch * pt->height[l]; +	} + +	mt->total_size = offset; +} + +static struct pipe_texture * +nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv40_miptree *mt; +	unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL | +	                     NOUVEAU_BUFFER_USAGE_TEXTURE; + +	mt = MALLOC(sizeof(struct nv40_miptree)); +	if (!mt) +		return NULL; +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->shadow_tex = NULL; +	mt->shadow_surface = NULL; + +	/* Swizzled textures must be POT */ +	if (pt->width[0] & (pt->width[0] - 1) || +	    pt->height[0] & (pt->height[0] - 1)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | +	                     PIPE_TEXTURE_USAGE_DISPLAY_TARGET | +	                     PIPE_TEXTURE_USAGE_DEPTH_STENCIL)) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else +	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) +		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +	else { +		switch (pt->format) { +		/* TODO: Figure out which formats can be swizzled */ +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_X8R8G8B8_UNORM: +		case PIPE_FORMAT_R16_SNORM: +		{ +			if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE)) +				mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +			break; +		} +		default: +			mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; +		} +	} + +	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) +		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE; + +	nv40_miptree_layout(mt); + +	mt->buffer = ws->buffer_create(ws, 256, buf_usage, mt->total_size); +	if (!mt->buffer) { +		FREE(mt); +		return NULL; +	} + +	return &mt->base; +} + +static struct pipe_texture * +nv40_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv40_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if 
(pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv40_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->level[0].pitch = stride[0]; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static void +nv40_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ +	struct pipe_texture *pt = *ppt; +	struct nv40_miptree *mt = (struct nv40_miptree *)pt; +	int l; + +	*ppt = NULL; +	if (--pt->refcount) +		return; + +	pipe_buffer_reference(pscreen, &mt->buffer, NULL); +	for (l = 0; l <= pt->last_level; l++) { +		if (mt->level[l].image_offset) +			FREE(mt->level[l].image_offset); +	} + +	if (mt->shadow_tex) { +		if (mt->shadow_surface) +			pscreen->tex_surface_release(pscreen, &mt->shadow_surface); +		nv40_miptree_release(pscreen, &mt->shadow_tex); +	} + +	FREE(mt); +} + +static struct pipe_surface * +nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct nv40_miptree *mt = (struct nv40_miptree *)pt; +	struct pipe_surface *ps; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = mt->level[level].pitch; +	ps->usage = flags; +	ps->status = PIPE_SURFACE_STATUS_DEFINED; +	ps->refcount = 1; +	ps->face = face; +	ps->level = level; +	ps->zslice = zslice; + +	if (pt->target == PIPE_TEXTURE_CUBE) { +		ps->offset = mt->level[level].image_offset[face]; +	} else +	if (pt->target == PIPE_TEXTURE_3D) { +		ps->offset = mt->level[level].image_offset[zslice]; +	} else { +		ps->offset = mt->level[level].image_offset[0]; +	} + +	return ps; +} + +static void +nv40_miptree_surface_del(struct pipe_screen *pscreen, +			 struct pipe_surface **psurface) +{ +	struct pipe_surface *ps = *psurface; + +	*psurface = NULL; +	if (--ps->refcount > 0) +		return; + +	pipe_texture_reference(&ps->texture, NULL); +	FREE(ps); +} + +void +nv40_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv40_miptree_create; +	pscreen->texture_blanket = nv40_miptree_blanket; +	pscreen->texture_release = nv40_miptree_release; +	pscreen->get_tex_surface = nv40_miptree_surface_new; +	pscreen->tex_surface_release = nv40_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c new file mode 100644 index 0000000000..9b9a43f49d --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" + +#include "nv40_context.h" + +struct nv40_query { +	struct nouveau_resource *object; +	unsigned type; +	boolean ready; +	uint64_t result; +}; + +static INLINE struct nv40_query * +nv40_query(struct pipe_query *pipe) +{ +	return (struct nv40_query *)pipe; +} + +static struct pipe_query * +nv40_query_create(struct pipe_context *pipe, unsigned query_type) +{ +	struct nv40_query *q; + +	q = CALLOC(1, sizeof(struct nv40_query)); +	q->type = query_type; + +	return (struct pipe_query *)q; +} + +static void +nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	
struct nv40_query *q = nv40_query(pq); + +	if (q->object) +		nv40->nvws->res_free(&q->object); +	FREE(q); +} + +static void +nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_query *q = nv40_query(pq); + +	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + +	/* Happens when end_query() is called, then another begin_query() +	 * without querying the result in-between.  For now we'll wait for +	 * the existing query to notify completion, but it could be better. +	 */ +	if (q->object) { +		uint64_t tmp; +		pipe->get_query_result(pipe, pq, 1, &tmp); +	} + +	if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object)) +		assert(0); +	nv40->nvws->notifier_reset(nv40->screen->query, q->object->start); + +	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1); +	OUT_RING  (1); +	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1); +	OUT_RING  (1); + +	q->ready = FALSE; +} + +static void +nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_query *q = nv40_query(pq); + +	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1); +	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) | +		   ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT)); +	FIRE_RING(NULL); +} + +static boolean +nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq, +		  boolean wait, uint64_t *result) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_query *q = nv40_query(pq); +	struct nouveau_winsys *nvws = nv40->nvws; + +	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + +	if (!q->ready) { +		unsigned status; + +		status = nvws->notifier_status(nv40->screen->query, +					       q->object->start); +		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { +			if (wait == FALSE) +				return FALSE; +			nvws->notifier_wait(nv40->screen->query, q->object->start, +					    NV_NOTIFY_STATE_STATUS_COMPLETED, +					    0); +		} + +		q->result = nvws->notifier_retval(nv40->screen->query, +						  q->object->start); +		q->ready = TRUE; +		nvws->res_free(&q->object); +	} + +	*result = q->result; +	return TRUE; +} + +void +nv40_init_query_functions(struct nv40_context *nv40) +{ +	nv40->pipe.create_query = nv40_query_create; +	nv40->pipe.destroy_query = nv40_query_destroy; +	nv40->pipe.begin_query = nv40_query_begin; +	nv40->pipe.end_query = nv40_query_end; +	nv40->pipe.get_query_result = nv40_query_result; +} diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c new file mode 100644 index 0000000000..2372bc8441 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.c @@ -0,0 +1,383 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf +#define NV4X_GRCLASS4497_CHIPSETS 0x00005450 +#define NV6X_GRCLASS4497_CHIPSETS 0x00000088 + +static const char * +nv40_screen_get_name(struct pipe_screen *pscreen) +{ +	struct nv40_screen *screen = nv40_screen(pscreen); +	struct nouveau_device *dev = screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv40_screen_get_vendor(struct pipe_screen *pscreen) +{ +	return "nouveau"; +} + +static int +nv40_screen_get_param(struct pipe_screen *pscreen, int param) +{ +	struct nv40_screen *screen = nv40_screen(pscreen); + +	switch (param) { +	case 
PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 16; +	case PIPE_CAP_NPOT_TEXTURES: +		return 1; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 1; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 1; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 1; +	case PIPE_CAP_POINT_SPRITE: +		return 1; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 4; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 1; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 1; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 13; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 10; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 13; +	case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +	case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +		return 1; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; /* We have 4 - but unsupported currently */ +	case NOUVEAU_CAP_HW_VTXBUF: +		return 1; +	case NOUVEAU_CAP_HW_IDXBUF: +		if (screen->curie->grclass == NV40TCL) +			return 1; +		return 0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv40_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 10.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 64.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 16.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 16.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static boolean +nv40_screen_surface_format_supported(struct pipe_screen *pscreen, +				     enum pipe_format format, +				     enum pipe_texture_target target, +				     unsigned tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM:  +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_A4R4G4B4_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM: +		case PIPE_FORMAT_R16_SNORM: +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +		case PIPE_FORMAT_I8_UNORM: +		case PIPE_FORMAT_A8L8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_DXT1_RGB: +		case PIPE_FORMAT_DXT1_RGBA: +		case PIPE_FORMAT_DXT3_RGBA: +		case PIPE_FORMAT_DXT5_RGBA: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static struct pipe_buffer * +nv40_surface_buffer(struct pipe_surface *surf) +{ +	struct nv40_miptree *mt = (struct nv40_miptree *)surf->texture; + +	return mt->buffer; +} + +static void * +nv40_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, +		 unsigned flags ) +{ +	struct pipe_winsys	*ws = screen->winsys; +	struct pipe_surface	*surface_to_map; +	void			*map; + +	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + +		if (!mt->shadow_tex) { +			unsigned old_tex_usage = surface->texture->tex_usage; +			surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR | +			                              PIPE_TEXTURE_USAGE_DYNAMIC; +			mt->shadow_tex = screen->texture_create(screen, surface->texture); +			surface->texture->tex_usage = old_tex_usage; + +			assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); +		} + +		mt->shadow_surface = screen->get_tex_surface +		( +			screen, 
mt->shadow_tex, +			surface->face, surface->level, surface->zslice, +			surface->usage +		); + +		surface_to_map = mt->shadow_surface; +	} +	else +		surface_to_map = surface; + +	assert(surface_to_map); +	map = ws->buffer_map(ws, nv40_surface_buffer(surface_to_map), flags); +	if (!map) +		return NULL; + +	return map + surface_to_map->offset; +} + +static void +nv40_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ +	struct pipe_winsys	*ws = screen->winsys; +	struct pipe_surface	*surface_to_unmap; + +	/* TODO: Copy from shadow just before push buffer is flushed instead. +	         There are probably some programs that map/unmap excessively +	         before rendering. */ +	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + +		assert(mt->shadow_tex); + +		surface_to_unmap = mt->shadow_surface; +	} +	else +		surface_to_unmap = surface; + +	assert(surface_to_unmap); + +	ws->buffer_unmap(ws, nv40_surface_buffer(surface_to_unmap)); + +	if (surface_to_unmap != surface) { +		struct nv40_screen *nvscreen = nv40_screen(screen); + +		nvscreen->eng2d->copy(nvscreen->eng2d, surface, 0, 0, +		                      surface_to_unmap, 0, 0, +		                      surface->width, surface->height); + +		screen->tex_surface_release(screen, &surface_to_unmap); +	} +} + +static void +nv40_screen_destroy(struct pipe_screen *pscreen) +{ +	struct nv40_screen *screen = nv40_screen(pscreen); +	struct nouveau_winsys *nvws = screen->nvws; + +	nvws->res_free(&screen->vp_exec_heap); +	nvws->res_free(&screen->vp_data_heap); +	nvws->res_free(&screen->query_heap); +	nvws->notifier_free(&screen->query); +	nvws->notifier_free(&screen->sync); +	nvws->grobj_free(&screen->curie); + +	FREE(pscreen); +} + +struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen); +	struct nouveau_stateobj *so; +	unsigned curie_class; +	unsigned chipset = nvws->channel->device->chipset; +	int ret; + +	if (!screen) +		return NULL; +	screen->nvws = nvws; + +	/* 2D engine setup */ +	screen->eng2d = nv04_surface_2d_init(nvws); +	screen->eng2d->buf = nv40_surface_buffer; + +	/* 3D object */ +	switch (chipset & 0xf0) { +	case 0x40: +		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f))) +			curie_class = NV40TCL; +		else +		if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) +			curie_class = NV44TCL; +		break; +	case 0x60: +		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) +			curie_class = NV44TCL; +		break; +	default: +		break; +	} + +	if (!curie_class) { +		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset); +		return NULL; +	} + +	ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		return FALSE; +	} + +	/* Notifier for sync purposes */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv40_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Query objects */ +	ret = nvws->notifier_alloc(nvws, 32, &screen->query); +	if (ret) { +		NOUVEAU_ERR("Error initialising query objects: %d\n", ret); +		nv40_screen_destroy(&screen->pipe); +		return NULL; +	} + +	ret = nvws->res_init(&screen->query_heap, 0, 32); +	if (ret) { +		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); +		nv40_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Vtxprog 
resources */ +	if (nvws->res_init(&screen->vp_exec_heap, 0, 512) || +	    nvws->res_init(&screen->vp_data_heap, 0, 256)) { +		nv40_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Static curie initialisation */ +	so = so_new(128, 0); +	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1); +	so_data  (so, screen->sync->handle); +	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->gart->handle); +	so_method(so, screen->curie, NV40TCL_DMA_COLOR1, 1); +	so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->gart->handle); +	so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2); +	so_data  (so, 0); +	so_data  (so, screen->query->handle); +	so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->vram->handle); +	so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2); +	so_data  (so, nvws->channel->vram->handle); +	so_data  (so, nvws->channel->vram->handle); + +	so_method(so, screen->curie, 0x1ea4, 3); +	so_data  (so, 0x00000010); +	so_data  (so, 0x01000100); +	so_data  (so, 0xff800006); + +	/* vtxprog output routing */ +	so_method(so, screen->curie, 0x1fc4, 1); +	so_data  (so, 0x06144321); +	so_method(so, screen->curie, 0x1fc8, 2); +	so_data  (so, 0xedcba987); +	so_data  (so, 0x00000021); +	so_method(so, screen->curie, 0x1fd0, 1); +	so_data  (so, 0x00171615); +	so_method(so, screen->curie, 0x1fd4, 1); +	so_data  (so, 0x001b1a19); + +	so_method(so, screen->curie, 0x1ef8, 1); +	so_data  (so, 0x0020ffff); +	so_method(so, screen->curie, 0x1d64, 1); +	so_data  (so, 0x00d30000); +	so_method(so, screen->curie, 0x1e94, 1); +	so_data  (so, 0x00000001); + +	so_emit(nvws, so); +	so_ref(NULL, &so); +	nvws->push_flush(nvws, 0, NULL); + +	screen->pipe.winsys = ws; +	screen->pipe.destroy = nv40_screen_destroy; + +	screen->pipe.get_name = nv40_screen_get_name; +	screen->pipe.get_vendor = nv40_screen_get_vendor; +	screen->pipe.get_param = nv40_screen_get_param; +	screen->pipe.get_paramf = nv40_screen_get_paramf; + +	screen->pipe.is_format_supported = nv40_screen_surface_format_supported; + +	screen->pipe.surface_map = nv40_surface_map; +	screen->pipe.surface_unmap = nv40_surface_unmap; + +	nv40_screen_init_miptree_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h new file mode 100644 index 0000000000..4500aa0e5c --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.h @@ -0,0 +1,37 @@ +#ifndef __NV40_SCREEN_H__ +#define __NV40_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv40_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; + +	unsigned cur_pctx; + +	/* HW graphics objects */ +	struct nv04_surface_2d *eng2d; +	struct nouveau_grobj *curie; +	struct nouveau_notifier *sync; + +	/* Query object resources */ +	struct nouveau_notifier *query; +	struct nouveau_resource *query_heap; + +	/* Vtxprog resources */ +	struct nouveau_resource *vp_exec_heap; +	struct nouveau_resource *vp_data_heap; + +	/* Current 3D state of channel */ +	struct nouveau_stateobj *state[NV40_STATE_MAX]; +}; + +static INLINE 
struct nv40_screen * +nv40_screen(struct pipe_screen *screen) +{ +	return (struct nv40_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h new file mode 100644 index 0000000000..854dccf548 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_shader.h @@ -0,0 +1,556 @@ +#ifndef __NV40_SHADER_H__ +#define __NV40_SHADER_H__ + +/* Vertex programs instruction set + * + * The NV40 instruction set is very similar to NV30.  Most fields are in + * a slightly different position in the instruction however. + * + * Merged instructions + *     In some cases it is possible to put two instructions into one opcode + *     slot.  The rules for when this is OK is not entirely clear to me yet. + * + *     There are separate writemasks and dest temp register fields for each + *     grouping of instructions.  There is however only one field with the + *     ID of a result register.  Writing to temp/result regs is selected by + *     setting VEC_RESULT/SCA_RESULT. + * + * Temporary registers + *     The source/dest temp register fields have been extended by 1 bit, to + *     give a total of 32 temporary registers. + * + * Relative Addressing + *     NV40 can use an address register to index into vertex attribute regs. + *     This is done by putting the offset value into INPUT_SRC and setting + *     the INDEX_INPUT flag. + * + * Conditional execution (see NV_vertex_program{2,3} for details) + *     There is a second condition code register on NV40, it's use is enabled + *     by setting the COND_REG_SELECT_1 flag. + * + * Texture lookup + *     TODO + */ + +/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */ +#define NV40_VP_INST_VEC_RESULT                                        (1 << 30) +/* uncertain.. */ +#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29) +/* use address reg as index into attribs */ +#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27) +#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25) +#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24) +#define NV40_VP_INST_SRC2_ABS                                          (1 << 23) +#define NV40_VP_INST_SRC1_ABS                                          (1 << 22) +#define NV40_VP_INST_SRC0_ABS                                          (1 << 21) +#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15 +#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15) +#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13) +#define NV40_VP_INST_COND_SHIFT                                               10 +#define NV40_VP_INST_COND_MASK                                       (0x7 << 10) +#    define NV40_VP_INST_COND_FL                                               0 +#    define NV40_VP_INST_COND_LT                                               1 +#    define NV40_VP_INST_COND_EQ                                               2 +#    define NV40_VP_INST_COND_LE                                               3 +#    define NV40_VP_INST_COND_GT                                               4 +#    define NV40_VP_INST_COND_NE                                               5 +#    define NV40_VP_INST_COND_GE                                               6 +#    define NV40_VP_INST_COND_TR                                               7 +#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8 
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8) +#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6 +#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6) +#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4 +#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4) +#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2 +#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2) +#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2 +#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2) +#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0 +#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0) +#define NV40_VP_INST0_KNOWN ( \ +                NV40_VP_INST_INDEX_INPUT | \ +                NV40_VP_INST_COND_REG_SELECT_1 | \ +                NV40_VP_INST_ADDR_REG_SELECT_1 | \ +                NV40_VP_INST_SRC2_ABS | \ +                NV40_VP_INST_SRC1_ABS | \ +                NV40_VP_INST_SRC0_ABS | \ +                NV40_VP_INST_VEC_DEST_TEMP_MASK | \ +                NV40_VP_INST_COND_TEST_ENABLE | \ +                NV40_VP_INST_COND_MASK | \ +                NV40_VP_INST_COND_SWZ_ALL_MASK | \ +                NV40_VP_INST_ADDR_SWZ_MASK) + +/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */ +#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22 +#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22) +#    define NV40_VP_INST_OP_NOP                                             0x00 +#    define NV40_VP_INST_OP_MOV                                             0x01 +#    define NV40_VP_INST_OP_MUL                                             0x02 +#    define NV40_VP_INST_OP_ADD                                             0x03 +#    define NV40_VP_INST_OP_MAD                                             0x04 +#    define NV40_VP_INST_OP_DP3                                             0x05 +#    define NV40_VP_INST_OP_DPH                                             0x06 +#    define NV40_VP_INST_OP_DP4                                             0x07 +#    define NV40_VP_INST_OP_DST                                             0x08 +#    define NV40_VP_INST_OP_MIN                                             0x09 +#    define NV40_VP_INST_OP_MAX                                             0x0A +#    define NV40_VP_INST_OP_SLT                                             0x0B +#    define NV40_VP_INST_OP_SGE                                             0x0C +#    define NV40_VP_INST_OP_ARL                                             0x0D +#    define NV40_VP_INST_OP_FRC                                             0x0E +#    define NV40_VP_INST_OP_FLR                                             0x0F +#    define NV40_VP_INST_OP_SEQ                                             0x10 +#    define NV40_VP_INST_OP_SFL                                             0x11 +#    define NV40_VP_INST_OP_SGT                                             0x12 +#    define NV40_VP_INST_OP_SLE                                             0x13 +#    define NV40_VP_INST_OP_SNE                                             0x14 +#    define NV40_VP_INST_OP_STR                                             0x15 +#    define NV40_VP_INST_OP_SSG                                             
0x16 +#    define NV40_VP_INST_OP_ARR                                             0x17 +#    define NV40_VP_INST_OP_ARA                                             0x18 +#    define NV40_VP_INST_OP_TXL                                             0x19 +#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27 +#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27) +#    define NV40_VP_INST_OP_NOP                                             0x00 +#    define NV40_VP_INST_OP_MOV                                             0x01 +#    define NV40_VP_INST_OP_RCP                                             0x02 +#    define NV40_VP_INST_OP_RCC                                             0x03 +#    define NV40_VP_INST_OP_RSQ                                             0x04 +#    define NV40_VP_INST_OP_EXP                                             0x05 +#    define NV40_VP_INST_OP_LOG                                             0x06 +#    define NV40_VP_INST_OP_LIT                                             0x07 +#    define NV40_VP_INST_OP_BRA                                             0x09 +#    define NV40_VP_INST_OP_CAL                                             0x0B +#    define NV40_VP_INST_OP_RET                                             0x0C +#    define NV40_VP_INST_OP_LG2                                             0x0D +#    define NV40_VP_INST_OP_EX2                                             0x0E +#    define NV40_VP_INST_OP_SIN                                             0x0F +#    define NV40_VP_INST_OP_COS                                             0x10 +#    define NV40_VP_INST_OP_PUSHA                                           0x13 +#    define NV40_VP_INST_OP_POPA                                            0x14 +#define NV40_VP_INST_CONST_SRC_SHIFT                                          12 +#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12) +#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8 +#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8) +#    define NV40_VP_INST_IN_POS                                                0 +#    define NV40_VP_INST_IN_WEIGHT                                             1 +#    define NV40_VP_INST_IN_NORMAL                                             2 +#    define NV40_VP_INST_IN_COL0                                               3 +#    define NV40_VP_INST_IN_COL1                                               4 +#    define NV40_VP_INST_IN_FOGC                                               5 +#    define NV40_VP_INST_IN_TC0                                                8 +#    define NV40_VP_INST_IN_TC(n)                                          (8+n) +#define NV40_VP_INST_SRC0H_SHIFT                                               0 +#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0) +#define NV40_VP_INST1_KNOWN ( \ +                NV40_VP_INST_VEC_OPCODE_MASK | \ +                NV40_VP_INST_SCA_OPCODE_MASK | \ +                NV40_VP_INST_CONST_SRC_MASK  | \ +                NV40_VP_INST_INPUT_SRC_MASK  | \ +                NV40_VP_INST_SRC0H_MASK \ +                ) + +/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */ +#define NV40_VP_INST_SRC0L_SHIFT                                              23 +#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23) +#define NV40_VP_INST_SRC1_SHIFT                                                6 
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6) +#define NV40_VP_INST_SRC2H_SHIFT                                               0 +#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0) +#define NV40_VP_INST_IADDRH_SHIFT                                              0 +#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0) + +/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */ +#define NV40_VP_INST_IADDRL_SHIFT                                             29 +#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29) +#define NV40_VP_INST_SRC2L_SHIFT                                              21 +#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21) +#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17 +#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17) +#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20) +#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19) +#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18) +#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17) +#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13 +#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13) +#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16) +#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15) +#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14) +#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13) +#define NV40_VP_INST_SCA_RESULT                                        (1 << 12) +#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7 +#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7) +#define NV40_VP_INST_DEST_SHIFT                                                2 +#define NV40_VP_INST_DEST_MASK                                         (31 << 2) +#    define NV40_VP_INST_DEST_POS                                              0 +#    define NV40_VP_INST_DEST_COL0                                             1 +#    define NV40_VP_INST_DEST_COL1                                             2 +#    define NV40_VP_INST_DEST_BFC0                                             3 +#    define NV40_VP_INST_DEST_BFC1                                             4 +#    define NV40_VP_INST_DEST_FOGC                                             5 +#    define NV40_VP_INST_DEST_PSZ                                              6 +#    define NV40_VP_INST_DEST_TC0                                              7 +#    define NV40_VP_INST_DEST_TC(n)                                        (7+n) +#    define NV40_VP_INST_DEST_TEMP                                          0x1F +#define NV40_VP_INST_INDEX_CONST                                        (1 << 1) +#define NV40_VP_INST_LAST                                               (1 << 0) +#define NV40_VP_INST3_KNOWN ( \ +                NV40_VP_INST_SRC2L_MASK |\ +                NV40_VP_INST_SCA_WRITEMASK_MASK |\ +                NV40_VP_INST_VEC_WRITEMASK_MASK |\ +                NV40_VP_INST_SCA_DEST_TEMP_MASK |\ +                NV40_VP_INST_DEST_MASK |\ +                NV40_VP_INST_INDEX_CONST) + +/* Useful to split the source selection regs into their pieces */ 
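+/* Illustrative only: using these masks, a 17-bit source selection word src
+ * would be packed into the SRC0H field of DWORD 1 and the SRC0L field of
+ * DWORD 2 along the lines of:
+ *
+ *   hw[1] |= ((src & NV40_VP_SRC0_HIGH_MASK) >> NV40_VP_SRC0_HIGH_SHIFT)
+ *                                            << NV40_VP_INST_SRC0H_SHIFT;
+ *   hw[2] |= (src & NV40_VP_SRC0_LOW_MASK) << NV40_VP_INST_SRC0L_SHIFT;
+ *
+ * (hw[] being the four instruction DWORDs; SRC1 fits entirely in DWORD 2,
+ *  SRC2 is split across DWORDs 2 and 3 in the same fashion as SRC0.)
+ */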
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9 +#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00 +#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF +#define NV40_VP_SRC2_HIGH_SHIFT                                               11 +#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800 +#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF + +/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */ +#define NV40_VP_SRC_NEGATE                                             (1 << 16) +#define NV40_VP_SRC_SWZ_X_SHIFT                                               14 +#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14) +#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12 +#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12) +#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10 +#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10) +#define NV40_VP_SRC_SWZ_W_SHIFT                                                8 +#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8) +#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8 +#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8) +#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2 +#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2) +#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0 +#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0) +#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0 +#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1 +#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2 +#    define NV40_VP_SRC_REG_TYPE_CONST                                         3 + + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + *         0 - Opcode, output reg/mask, ATTRIB source + *         1 - Source 0 + *         2 - Source 1 + *         3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + *                 result.color == R0.xyzw + *                 result.depth == R1.z + * When the fragprog contains instructions to write depth, + * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. + *  + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords.  As such instructions such as: + *  + *                 ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and + * SWIZZLE_ONE. 
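+ *
+ * For example, a non-native swizzle such as .xy01 is built up in a temp by
+ * src_native_swz() in nv40_fragprog.c:
+ *
+ *         MOV tmp.xy, src;        (native components)
+ *         SFL tmp.z, tmp;         (ZERO components)
+ *         STR tmp.w, tmp;         (ONE components)
+ *
+ * with per-component ExtSwizzle negation handled by an extra STR/MUL pair.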
+ * + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as + * SWIZZLE_ZERO is implemented simply by not writing to the relevant components + * of the destination. + * + * Looping + *   Loops appear to be fairly expensive on NV40 at least, the proprietary + *   driver goes to a lot of effort to avoid using the native looping + *   instructions.  If the total number of *executed* instructions between + *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop. + *   The maximum loop count is 255. + * + * Conditional execution + *   TODO + *  + * Non-native instructions: + *         LIT + *         LRP - MAD+MAD + *         SUB - ADD, negate second source + *         RSQ - LG2 + EX2 + *         POW - LG2 + MUL + EX2 + *         SCS - COS + SIN + *         XPD + *         DP2 - MUL + ADD + *         NRM + */ + +//== Opcode / Destination selection == +#define NV40_FP_OP_PROGRAM_END                                          (1 << 0) +#define NV40_FP_OP_OUT_REG_SHIFT                                               1 +#define NV40_FP_OP_OUT_REG_MASK                                        (63 << 1) +/* Needs to be set when writing outputs to get expected result.. */ +#define NV40_FP_OP_OUT_REG_HALF                                         (1 << 7) +#define NV40_FP_OP_COND_WRITE_ENABLE                                    (1 << 8) +#define NV40_FP_OP_OUTMASK_SHIFT                                               9 +#define NV40_FP_OP_OUTMASK_MASK                                       (0xF << 9) +#    define NV40_FP_OP_OUT_X                                            (1 << 9) +#    define NV40_FP_OP_OUT_Y                                            (1 <<10) +#    define NV40_FP_OP_OUT_Z                                            (1 <<11) +#    define NV40_FP_OP_OUT_W                                            (1 <<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. 
+ */ +#define NV40_FP_OP_INPUT_SRC_SHIFT                                            13 +#define NV40_FP_OP_INPUT_SRC_MASK                                     (15 << 13) +#    define NV40_FP_OP_INPUT_SRC_POSITION                                    0x0 +#    define NV40_FP_OP_INPUT_SRC_COL0                                        0x1 +#    define NV40_FP_OP_INPUT_SRC_COL1                                        0x2 +#    define NV40_FP_OP_INPUT_SRC_FOGC                                        0x3 +#    define NV40_FP_OP_INPUT_SRC_TC0                                         0x4 +#    define NV40_FP_OP_INPUT_SRC_TC(n)                                 (0x4 + n) +#    define NV40_FP_OP_INPUT_SRC_FACING                                      0xE +#define NV40_FP_OP_TEX_UNIT_SHIFT                                             17 +#define NV40_FP_OP_TEX_UNIT_MASK                                     (0xF << 17) +#define NV40_FP_OP_PRECISION_SHIFT                                            22 +#define NV40_FP_OP_PRECISION_MASK                                      (3 << 22) +#   define NV40_FP_PRECISION_FP32                                              0 +#   define NV40_FP_PRECISION_FP16                                              1 +#   define NV40_FP_PRECISION_FX12                                              2 +#define NV40_FP_OP_OPCODE_SHIFT                                               24 +#define NV40_FP_OP_OPCODE_MASK                                      (0x3F << 24) +#        define NV40_FP_OP_OPCODE_NOP                                       0x00 +#        define NV40_FP_OP_OPCODE_MOV                                       0x01 +#        define NV40_FP_OP_OPCODE_MUL                                       0x02 +#        define NV40_FP_OP_OPCODE_ADD                                       0x03 +#        define NV40_FP_OP_OPCODE_MAD                                       0x04 +#        define NV40_FP_OP_OPCODE_DP3                                       0x05 +#        define NV40_FP_OP_OPCODE_DP4                                       0x06 +#        define NV40_FP_OP_OPCODE_DST                                       0x07 +#        define NV40_FP_OP_OPCODE_MIN                                       0x08 +#        define NV40_FP_OP_OPCODE_MAX                                       0x09 +#        define NV40_FP_OP_OPCODE_SLT                                       0x0A +#        define NV40_FP_OP_OPCODE_SGE                                       0x0B +#        define NV40_FP_OP_OPCODE_SLE                                       0x0C +#        define NV40_FP_OP_OPCODE_SGT                                       0x0D +#        define NV40_FP_OP_OPCODE_SNE                                       0x0E +#        define NV40_FP_OP_OPCODE_SEQ                                       0x0F +#        define NV40_FP_OP_OPCODE_FRC                                       0x10 +#        define NV40_FP_OP_OPCODE_FLR                                       0x11 +#        define NV40_FP_OP_OPCODE_KIL                                       0x12 +#        define NV40_FP_OP_OPCODE_PK4B                                      0x13 +#        define NV40_FP_OP_OPCODE_UP4B                                      0x14 +/* DDX/DDY can only write to XY */ +#        define NV40_FP_OP_OPCODE_DDX                                       0x15 +#        define NV40_FP_OP_OPCODE_DDY                                       0x16 +#        define NV40_FP_OP_OPCODE_TEX                                       0x17 +#        define NV40_FP_OP_OPCODE_TXP                                
       0x18 +#        define NV40_FP_OP_OPCODE_TXD                                       0x19 +#        define NV40_FP_OP_OPCODE_RCP                                       0x1A +#        define NV40_FP_OP_OPCODE_EX2                                       0x1C +#        define NV40_FP_OP_OPCODE_LG2                                       0x1D +#        define NV40_FP_OP_OPCODE_STR                                       0x20 +#        define NV40_FP_OP_OPCODE_SFL                                       0x21 +#        define NV40_FP_OP_OPCODE_COS                                       0x22 +#        define NV40_FP_OP_OPCODE_SIN                                       0x23 +#        define NV40_FP_OP_OPCODE_PK2H                                      0x24 +#        define NV40_FP_OP_OPCODE_UP2H                                      0x25 +#        define NV40_FP_OP_OPCODE_PK4UB                                     0x27 +#        define NV40_FP_OP_OPCODE_UP4UB                                     0x28 +#        define NV40_FP_OP_OPCODE_PK2US                                     0x29 +#        define NV40_FP_OP_OPCODE_UP2US                                     0x2A +#        define NV40_FP_OP_OPCODE_DP2A                                      0x2E +#        define NV40_FP_OP_OPCODE_TXL                                       0x2F +#        define NV40_FP_OP_OPCODE_TXB                                       0x31 +#        define NV40_FP_OP_OPCODE_DIV                                       0x3A +#        define NV40_FP_OP_OPCODE_UNK_LIT                                   0x3C +/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/ +#        define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0 +#        define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1 +#        define NV40_FP_OP_BRA_OPCODE_IF                                     0x2 +#        define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3 +#        define NV40_FP_OP_BRA_OPCODE_REP                                    0x4 +#        define NV40_FP_OP_BRA_OPCODE_RET                                    0x5 +#define NV40_FP_OP_OUT_SAT                                             (1 << 31) + +/* high order bits of SRC0 */ +#define NV40_FP_OP_OUT_ABS                                             (1 << 29) +#define NV40_FP_OP_COND_SWZ_W_SHIFT                                           27 +#define NV40_FP_OP_COND_SWZ_W_MASK                                     (3 << 27) +#define NV40_FP_OP_COND_SWZ_Z_SHIFT                                           25 +#define NV40_FP_OP_COND_SWZ_Z_MASK                                     (3 << 25) +#define NV40_FP_OP_COND_SWZ_Y_SHIFT                                           23 +#define NV40_FP_OP_COND_SWZ_Y_MASK                                     (3 << 23) +#define NV40_FP_OP_COND_SWZ_X_SHIFT                                           21 +#define NV40_FP_OP_COND_SWZ_X_MASK                                     (3 << 21) +#define NV40_FP_OP_COND_SWZ_ALL_SHIFT                                         21 +#define NV40_FP_OP_COND_SWZ_ALL_MASK                                (0xFF << 21) +#define NV40_FP_OP_COND_SHIFT                                                 18 +#define NV40_FP_OP_COND_MASK                                        (0x07 << 18) +#        define NV40_FP_OP_COND_FL                                             0 +#        define NV40_FP_OP_COND_LT                                             1 +#        define NV40_FP_OP_COND_EQ                               
              2 +#        define NV40_FP_OP_COND_LE                                             3 +#        define NV40_FP_OP_COND_GT                                             4 +#        define NV40_FP_OP_COND_NE                                             5 +#        define NV40_FP_OP_COND_GE                                             6 +#        define NV40_FP_OP_COND_TR                                             7 + +/* high order bits of SRC1 */ +#define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1<<31) +#define NV40_FP_OP_DST_SCALE_SHIFT                                            28 +#define NV40_FP_OP_DST_SCALE_MASK                                      (3 << 28) +#define NV40_FP_OP_DST_SCALE_1X                                                0 +#define NV40_FP_OP_DST_SCALE_2X                                                1 +#define NV40_FP_OP_DST_SCALE_4X                                                2 +#define NV40_FP_OP_DST_SCALE_8X                                                3 +#define NV40_FP_OP_DST_SCALE_INV_2X                                            5 +#define NV40_FP_OP_DST_SCALE_INV_4X                                            6 +#define NV40_FP_OP_DST_SCALE_INV_8X                                            7 + +/* SRC1 LOOP */ +#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19 +#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19) +#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10 +#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10) +#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2 +#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2) + +/* SRC1 IF */ +#define NV40_FP_OP_ELSE_ID_SHIFT                                               2 +#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2) + +/* SRC1 CAL */ +#define NV40_FP_OP_IADDR_SHIFT                                                 2 +#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2) + +/* SRC1 REP + *   I have no idea why there are 3 count values here..  but they + *   have always been filled with the same value in my tests so + *   far.. 
+ */ +#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2 +#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2) +#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10 +#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10) +#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19 +#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19) + +/* SRC2 REP/IF */ +#define NV40_FP_OP_END_ID_SHIFT                                                2 +#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2) + +// SRC2 high-order +#define NV40_FP_OP_INDEX_INPUT                                         (1 << 30) +#define NV40_FP_OP_ADDR_INDEX_SHIFT                                           19 +#define NV40_FP_OP_ADDR_INDEX_MASK                                   (0xF << 19) + +//== Register selection == +#define NV40_FP_REG_TYPE_SHIFT                                                 0 +#define NV40_FP_REG_TYPE_MASK                                           (3 << 0) +#        define NV40_FP_REG_TYPE_TEMP                                          0 +#        define NV40_FP_REG_TYPE_INPUT                                         1 +#        define NV40_FP_REG_TYPE_CONST                                         2 +#define NV40_FP_REG_SRC_SHIFT                                                  2 +#define NV40_FP_REG_SRC_MASK                                           (63 << 2) +#define NV40_FP_REG_SRC_HALF                                            (1 << 8) +#define NV40_FP_REG_SWZ_ALL_SHIFT                                              9 +#define NV40_FP_REG_SWZ_ALL_MASK                                      (255 << 9) +#define NV40_FP_REG_SWZ_X_SHIFT                                                9 +#define NV40_FP_REG_SWZ_X_MASK                                          (3 << 9) +#define NV40_FP_REG_SWZ_Y_SHIFT                                               11 +#define NV40_FP_REG_SWZ_Y_MASK                                         (3 << 11) +#define NV40_FP_REG_SWZ_Z_SHIFT                                               13 +#define NV40_FP_REG_SWZ_Z_MASK                                         (3 << 13) +#define NV40_FP_REG_SWZ_W_SHIFT                                               15 +#define NV40_FP_REG_SWZ_W_MASK                                         (3 << 15) +#        define NV40_FP_SWIZZLE_X                                              0 +#        define NV40_FP_SWIZZLE_Y                                              1 +#        define NV40_FP_SWIZZLE_Z                                              2 +#        define NV40_FP_SWIZZLE_W                                              3 +#define NV40_FP_REG_NEGATE                                             (1 << 17) + +#ifndef NV40_SHADER_NO_FUCKEDNESS +#define NV40SR_NONE	0 +#define NV40SR_OUTPUT	1 +#define NV40SR_INPUT	2 +#define NV40SR_TEMP	3 +#define NV40SR_CONST	4 + +struct nv40_sreg { +	int type; +	int index; + +	int dst_scale; + +	int negate; +	int abs; +	int swz[4]; + +	int cc_update; +	int cc_update_reg; +	int cc_test; +	int cc_test_reg; +	int cc_swz[4]; +}; + +static INLINE struct nv40_sreg +nv40_sr(int type, int index) +{ +	struct nv40_sreg temp = { +		.type = type, +		.index = index, +		.dst_scale = DEF_SCALE, +		.abs = 0, +		.negate = 0, +		.swz = { 0, 1, 2, 3 }, +		.cc_update = 0, +		.cc_update_reg = 0, +		.cc_test = DEF_CTEST, +		.cc_test_reg = 0, +		.cc_swz = { 0, 1, 2, 3 }, 
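+		/* Note: DEF_SCALE and DEF_CTEST are expected to be provided by
+		 * the file including this header -- presumably the no-op
+		 * defaults, i.e. the 1X destination scale and the TR (always
+		 * pass) condition. */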
+	}; +	return temp; +} + +static INLINE struct nv40_sreg +nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w) +{ +	struct nv40_sreg dst = src; + +	dst.swz[SWZ_X] = src.swz[x]; +	dst.swz[SWZ_Y] = src.swz[y]; +	dst.swz[SWZ_Z] = src.swz[z]; +	dst.swz[SWZ_W] = src.swz[w]; +	return dst; +} + +static INLINE struct nv40_sreg +nv40_sr_neg(struct nv40_sreg src) +{ +	src.negate = !src.negate; +	return src; +} + +static INLINE struct nv40_sreg +nv40_sr_abs(struct nv40_sreg src) +{ +	src.abs = 1; +	return src; +} + +static INLINE struct nv40_sreg +nv40_sr_scale(struct nv40_sreg src, int scale) +{ +	src.dst_scale = scale; +	return src; +} +#endif + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c new file mode 100644 index 0000000000..2eff25aa83 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.c @@ -0,0 +1,740 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "draw/draw_context.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +static void * +nv40_blend_state_create(struct pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nouveau_grobj *curie = nv40->screen->curie; +	struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso)); +	struct nouveau_stateobj *so = so_new(16, 0); + +	if (cso->blend_enable) { +		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3); +		so_data  (so, 1); +		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | +			       nvgl_blend_func(cso->rgb_src_factor)); +		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | +			      nvgl_blend_func(cso->rgb_dst_factor)); +		so_method(so, curie, NV40TCL_BLEND_EQUATION, 1); +		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 | +			      nvgl_blend_eqn(cso->rgb_func)); +	} else { +		so_method(so, curie, NV40TCL_BLEND_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_method(so, curie, NV40TCL_COLOR_MASK, 1); +	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | +		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | +		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) | +		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0))); + +	if (cso->logicop_enable) { +		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2); +		so_data  (so, 1); +		so_data  (so, nvgl_logicop_func(cso->logicop_func)); +	} else { +		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_method(so, curie, NV40TCL_DITHER_ENABLE, 1); +	so_data  (so, cso->dither ? 
1 : 0); + +	so_ref(so, &bso->so); +	bso->pipe = *cso; +	return (void *)bso; +} + +static void +nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->blend = hwcso; +	nv40->dirty |= NV40_NEW_BLEND; +} + +static void +nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_blend_state *bso = hwcso; + +	so_ref(NULL, &bso->so); +	FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { +	unsigned ret; + +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		ret = NV40TCL_TEX_WRAP_S_REPEAT; +		break; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		ret = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_CLAMP: +		ret = NV40TCL_TEX_WRAP_S_CLAMP; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER; +		break; +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP; +		break; +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		ret = NV40TCL_TEX_WRAP_S_REPEAT; +		break; +	} + +	return ret >> NV40TCL_TEX_WRAP_S_SHIFT; +} + +static void * +nv40_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ +	struct nv40_sampler_state *ps; +	uint32_t filter = 0; + +	ps = MALLOC(sizeof(struct nv40_sampler_state)); + +	ps->fmt = 0; +	if (!cso->normalized_coords) +		ps->fmt |= NV40TCL_TEX_FORMAT_RECT; + +	ps->wrap = ((wrap_mode(cso->wrap_s) << NV40TCL_TEX_WRAP_S_SHIFT) | +		    (wrap_mode(cso->wrap_t) << NV40TCL_TEX_WRAP_T_SHIFT) | +		    (wrap_mode(cso->wrap_r) << NV40TCL_TEX_WRAP_R_SHIFT)); + +	ps->en = 0; +	if (cso->max_anisotropy >= 2.0) { +		/* no idea, binary driver sets it, works without it.. meh.. 
*/ +		ps->wrap |= (1 << 5); + +		if (cso->max_anisotropy >= 16.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X; +		} else +		if (cso->max_anisotropy >= 12.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X; +		} else +		if (cso->max_anisotropy >= 10.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X; +		} else +		if (cso->max_anisotropy >= 8.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X; +		} else +		if (cso->max_anisotropy >= 6.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X; +		} else +		if (cso->max_anisotropy >= 4.0) { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X; +		} else { +			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X; +		} +	} + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		filter |= NV40TCL_TEX_FILTER_MAG_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		filter |= NV40TCL_TEX_FILTER_MAG_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST; +			break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR; +			break; +		} +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		switch (cso->min_mip_filter) { +		case PIPE_TEX_MIPFILTER_NEAREST: +			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST; +		break; +		case PIPE_TEX_MIPFILTER_LINEAR: +			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR; +			break; +		case PIPE_TEX_MIPFILTER_NONE: +		default: +			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST; +			break; +		} +		break; +	} + +	ps->filt = filter; + +	{ +		float limit; + +		limit = CLAMP(cso->lod_bias, -16.0, 15.0); +		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + +		limit = CLAMP(cso->max_lod, 0.0, 15.0); +		ps->en |= (int)(limit * 256.0) << 7; + +		limit = CLAMP(cso->min_lod, 0.0, 15.0); +		ps->en |= (int)(limit * 256.0) << 19; +	} + + +	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +		switch (cso->compare_func) { +		case PIPE_FUNC_NEVER: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NEVER; +			break; +		case PIPE_FUNC_GREATER: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GREATER; +			break; +		case PIPE_FUNC_EQUAL: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_EQUAL; +			break; +		case PIPE_FUNC_GEQUAL: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GEQUAL; +			break; +		case PIPE_FUNC_LESS: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LESS; +			break; +		case PIPE_FUNC_NOTEQUAL: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL; +			break; +		case PIPE_FUNC_LEQUAL: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LEQUAL; +			break; +		case PIPE_FUNC_ALWAYS: +			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_ALWAYS; +			break; +		default: +			break; +		} +	} + +	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | +		    (float_to_ubyte(cso->border_color[0]) << 16) | +		    (float_to_ubyte(cso->border_color[1]) <<  8) | +		    (float_to_ubyte(cso->border_color[2]) <<  0)); + +	return (void *)ps; +} + +static void +nv40_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		nv40->tex_sampler[unit] = sampler[unit]; +		nv40->dirty_samplers |= (1 << unit); +	} + +	for (unit = nr; unit < nv40->nr_samplers; unit++) { +		nv40->tex_sampler[unit] = NULL; +		nv40->dirty_samplers |= (1 << unit); +	} + +	nv40->nr_samplers = nr; +	nv40->dirty |= NV40_NEW_SAMPLER; 
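+	/* Units at and above the new sampler count were unbound in the second
+	 * loop above and flagged in dirty_samplers as well, so their state is
+	 * revalidated on the next draw instead of lingering from an earlier
+	 * bind. */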
+} + +static void +nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void +nv40_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **miptree) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	unsigned unit; + +	for (unit = 0; unit < nr; unit++) { +		pipe_texture_reference((struct pipe_texture **) +				       &nv40->tex_miptree[unit], miptree[unit]); +		nv40->dirty_samplers |= (1 << unit); +	} + +	for (unit = nr; unit < nv40->nr_textures; unit++) { +		pipe_texture_reference((struct pipe_texture **) +				       &nv40->tex_miptree[unit], NULL); +		nv40->dirty_samplers |= (1 << unit); +	} + +	nv40->nr_textures = nr; +	nv40->dirty |= NV40_NEW_SAMPLER; +} + +static void * +nv40_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); +	struct nouveau_stateobj *so = so_new(32, 0); +	struct nouveau_grobj *curie = nv40->screen->curie; + +	/*XXX: ignored: +	 * 	light_twoside +	 * 	point_smooth -nohw +	 * 	multisample +	 */ + +	so_method(so, curie, NV40TCL_SHADE_MODEL, 1); +	so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT : +				       NV40TCL_SHADE_MODEL_SMOOTH); + +	so_method(so, curie, NV40TCL_LINE_WIDTH, 2); +	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff); +	so_data  (so, cso->line_smooth ? 1 : 0); +	so_method(so, curie, NV40TCL_LINE_STIPPLE_ENABLE, 2); +	so_data  (so, cso->line_stipple_enable ? 1 : 0); +	so_data  (so, (cso->line_stipple_pattern << 16) | +		       cso->line_stipple_factor); + +	so_method(so, curie, NV40TCL_POINT_SIZE, 1); +	so_data  (so, fui(cso->point_size)); + +	so_method(so, curie, NV40TCL_POLYGON_MODE_FRONT, 6); +	if (cso->front_winding == PIPE_WINDING_CCW) { +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV40TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV40TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV40TCL_CULL_FACE_BACK); +			break; +		} +		so_data(so, NV40TCL_FRONT_FACE_CCW); +	} else { +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV40TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV40TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV40TCL_CULL_FACE_BACK); +			break; +		} +		so_data(so, NV40TCL_FRONT_FACE_CW); +	} +	so_data(so, cso->poly_smooth ? 1 : 0); +	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + +	so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); +	so_data  (so, cso->poly_stipple_enable ? 
1 : 0); + +	so_method(so, curie, NV40TCL_POLYGON_OFFSET_POINT_ENABLE, 3); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if (cso->offset_cw || cso->offset_ccw) { +		so_method(so, curie, NV40TCL_POLYGON_OFFSET_FACTOR, 2); +		so_data  (so, fui(cso->offset_scale)); +		so_data  (so, fui(cso->offset_units * 2)); +	} + +	so_method(so, curie, NV40TCL_POINT_SPRITE, 1); +	if (cso->point_sprite) { +		unsigned psctl = (1 << 0), i; + +		for (i = 0; i < 8; i++) { +			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) +				psctl |= (1 << (8 + i)); +		} + +		so_data(so, psctl); +	} else { +		so_data(so, 0); +	} + +	so_ref(so, &rsso->so); +	rsso->pipe = *cso; +	return (void *)rsso; +} + +static void +nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->rasterizer = hwcso; +	nv40->dirty |= NV40_NEW_RAST; +	nv40->draw_dirty |= NV40_NEW_RAST; +} + +static void +nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_rasterizer_state *rsso = hwcso; + +	so_ref(NULL, &rsso->so); +	FREE(rsso); +} + +static void * +nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); +	struct nouveau_stateobj *so = so_new(32, 0); +	struct nouveau_grobj *curie = nv40->screen->curie; + +	so_method(so, curie, NV40TCL_DEPTH_FUNC, 3); +	so_data  (so, nvgl_comparison_op(cso->depth.func)); +	so_data  (so, cso->depth.writemask ? 1 : 0); +	so_data  (so, cso->depth.enabled ? 1 : 0); + +	so_method(so, curie, NV40TCL_ALPHA_TEST_ENABLE, 3); +	so_data  (so, cso->alpha.enabled ? 1 : 0); +	so_data  (so, nvgl_comparison_op(cso->alpha.func)); +	so_data  (so, float_to_ubyte(cso->alpha.ref_value)); + +	if (cso->stencil[0].enabled) { +		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 8); +		so_data  (so, cso->stencil[0].enabled ? 1 : 0); +		so_data  (so, cso->stencil[0].writemask); +		so_data  (so, nvgl_comparison_op(cso->stencil[0].func)); +		so_data  (so, cso->stencil[0].ref_value); +		so_data  (so, cso->stencil[0].valuemask); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); +	} else { +		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 1); +		so_data  (so, 0); +	} + +	if (cso->stencil[1].enabled) { +		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 8); +		so_data  (so, cso->stencil[1].enabled ? 
1 : 0); +		so_data  (so, cso->stencil[1].writemask); +		so_data  (so, nvgl_comparison_op(cso->stencil[1].func)); +		so_data  (so, cso->stencil[1].ref_value); +		so_data  (so, cso->stencil[1].valuemask); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); +	} else { +		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_ref(so, &zsaso->so); +	zsaso->pipe = *cso; +	return (void *)zsaso; +} + +static void +nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->zsa = hwcso; +	nv40->dirty |= NV40_NEW_ZSA; +} + +static void +nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_zsa_state *zsaso = hwcso; + +	so_ref(NULL, &zsaso->so); +	FREE(zsaso); +} + +static void * +nv40_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_vertex_program *vp; + +	vp = CALLOC(1, sizeof(struct nv40_vertex_program)); +	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe); + +	return (void *)vp; +} + +static void +nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->vertprog = hwcso; +	nv40->dirty |= NV40_NEW_VERTPROG; +	nv40->draw_dirty |= NV40_NEW_VERTPROG; +} + +static void +nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_vertex_program *vp = hwcso; + +	draw_delete_vertex_shader(nv40->draw, vp->draw); +	nv40_vertprog_destroy(nv40, vp); +	FREE((void*)vp->pipe.tokens); +	FREE(vp); +} + +static void * +nv40_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv40_fragment_program *fp; + +	fp = CALLOC(1, sizeof(struct nv40_fragment_program)); +	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + +	tgsi_scan_shader(fp->pipe.tokens, &fp->info); + +	return (void *)fp; +} + +static void +nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->fragprog = hwcso; +	nv40->dirty |= NV40_NEW_FRAGPROG; +} + +static void +nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv40_fragment_program *fp = hwcso; + +	nv40_fragprog_destroy(nv40, fp); +	FREE((void*)fp->pipe.tokens); +	FREE(fp); +} + +static void +nv40_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->blend_colour = *bcol; +	nv40->dirty |= NV40_NEW_BCOL; +} + +static void +nv40_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->clip = *clip; +	nv40->dirty |= NV40_NEW_UCP; +	nv40->draw_dirty |= NV40_NEW_UCP; +} + +static void +nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->constbuf[shader] = buf->buffer; +	nv40->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float)); + +	if (shader == PIPE_SHADER_VERTEX) { +		nv40->dirty |= NV40_NEW_VERTPROG; +	} else +	if (shader == 
PIPE_SHADER_FRAGMENT) { +		nv40->dirty |= NV40_NEW_FRAGPROG; +	} +} + +static void +nv40_set_framebuffer_state(struct pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->framebuffer = *fb; +	nv40->dirty |= NV40_NEW_FB; +} + +static void +nv40_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	memcpy(nv40->stipple, stipple->stipple, 4 * 32); +	nv40->dirty |= NV40_NEW_STIPPLE; +} + +static void +nv40_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->scissor = *s; +	nv40->dirty |= NV40_NEW_SCISSOR; +} + +static void +nv40_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *vpt) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->viewport = *vpt; +	nv40->dirty |= NV40_NEW_VIEWPORT; +	nv40->draw_dirty |= NV40_NEW_VIEWPORT; +} + +static void +nv40_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_buffer *vb) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	memcpy(nv40->vtxbuf, vb, sizeof(*vb) * count); +	nv40->vtxbuf_nr = count; + +	nv40->dirty |= NV40_NEW_ARRAYS; +	nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			 const struct pipe_vertex_element *ve) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	memcpy(nv40->vtxelt, ve, sizeof(*ve) * count); +	nv40->vtxelt_nr = count; + +	nv40->dirty |= NV40_NEW_ARRAYS; +	nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +	struct nv40_context *nv40 = nv40_context(pipe); + +	nv40->edgeflags = bitfield; +	nv40->dirty |= NV40_NEW_ARRAYS; +	nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +void +nv40_init_state_functions(struct nv40_context *nv40) +{ +	nv40->pipe.create_blend_state = nv40_blend_state_create; +	nv40->pipe.bind_blend_state = nv40_blend_state_bind; +	nv40->pipe.delete_blend_state = nv40_blend_state_delete; + +	nv40->pipe.create_sampler_state = nv40_sampler_state_create; +	nv40->pipe.bind_sampler_states = nv40_sampler_state_bind; +	nv40->pipe.delete_sampler_state = nv40_sampler_state_delete; +	nv40->pipe.set_sampler_textures = nv40_set_sampler_texture; + +	nv40->pipe.create_rasterizer_state = nv40_rasterizer_state_create; +	nv40->pipe.bind_rasterizer_state = nv40_rasterizer_state_bind; +	nv40->pipe.delete_rasterizer_state = nv40_rasterizer_state_delete; + +	nv40->pipe.create_depth_stencil_alpha_state = +		nv40_depth_stencil_alpha_state_create; +	nv40->pipe.bind_depth_stencil_alpha_state = +		nv40_depth_stencil_alpha_state_bind; +	nv40->pipe.delete_depth_stencil_alpha_state = +		nv40_depth_stencil_alpha_state_delete; + +	nv40->pipe.create_vs_state = nv40_vp_state_create; +	nv40->pipe.bind_vs_state = nv40_vp_state_bind; +	nv40->pipe.delete_vs_state = nv40_vp_state_delete; + +	nv40->pipe.create_fs_state = nv40_fp_state_create; +	nv40->pipe.bind_fs_state = nv40_fp_state_bind; +	nv40->pipe.delete_fs_state = nv40_fp_state_delete; + +	nv40->pipe.set_blend_color = nv40_set_blend_color; +	nv40->pipe.set_clip_state = nv40_set_clip_state; +	nv40->pipe.set_constant_buffer = nv40_set_constant_buffer; +	nv40->pipe.set_framebuffer_state = nv40_set_framebuffer_state; +	nv40->pipe.set_polygon_stipple = nv40_set_polygon_stipple; +	
nv40->pipe.set_scissor_state = nv40_set_scissor_state; +	nv40->pipe.set_viewport_state = nv40_set_viewport_state; + +	nv40->pipe.set_edgeflags = nv40_set_edgeflags; +	nv40->pipe.set_vertex_buffers = nv40_set_vertex_buffers; +	nv40->pipe.set_vertex_elements = nv40_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h new file mode 100644 index 0000000000..9c55903ae3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.h @@ -0,0 +1,91 @@ +#ifndef __NV40_STATE_H__ +#define __NV40_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv40_sampler_state { +	uint32_t fmt; +	uint32_t wrap; +	uint32_t en; +	uint32_t filt; +	uint32_t bcol; +}; + +struct nv40_vertex_program_exec { +	uint32_t data[4]; +	boolean has_branch_offset; +	int const_index; +}; + +struct nv40_vertex_program_data { +	int index; /* immediates == -1 */ +	float value[4]; +}; + +struct nv40_vertex_program { +	struct pipe_shader_state pipe; + +	struct draw_vertex_shader *draw; + +	boolean translated; + +	struct pipe_clip_state ucp; + +	struct nv40_vertex_program_exec *insns; +	unsigned nr_insns; +	struct nv40_vertex_program_data *consts; +	unsigned nr_consts; + +	struct nouveau_resource *exec; +	unsigned exec_start; +	struct nouveau_resource *data; +	unsigned data_start; +	unsigned data_start_min; + +	uint32_t ir; +	uint32_t or; +	uint32_t clip_ctrl; +	struct nouveau_stateobj *so; +}; + +struct nv40_fragment_program_data { +	unsigned offset; +	unsigned index; +}; + +struct nv40_fragment_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; + +	boolean translated; +	unsigned samplers; + +	uint32_t *insn; +	int       insn_len; + +	struct nv40_fragment_program_data *consts; +	unsigned nr_consts; + +	struct pipe_buffer *buffer; + +	uint32_t fp_control; +	struct nouveau_stateobj *so; +}; + +struct nv40_miptree { +	struct pipe_texture base; + +	struct pipe_buffer *buffer; +	uint total_size; + +	struct pipe_texture *shadow_tex; +	struct pipe_surface *shadow_surface; + +	struct { +		uint pitch; +		uint *image_offset; +	} level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c new file mode 100644 index 0000000000..95e6d7394f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_blend.c @@ -0,0 +1,40 @@ +#include "nv40_context.h" + +static boolean +nv40_state_blend_validate(struct nv40_context *nv40) +{ +	so_ref(nv40->blend->so, &nv40->state.hw[NV40_STATE_BLEND]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_blend = { +	.validate = nv40_state_blend_validate, +	.dirty = { +		.pipe = NV40_NEW_BLEND, +		.hw = NV40_STATE_BLEND +	} +}; + +static boolean +nv40_state_blend_colour_validate(struct nv40_context *nv40) +{ +	struct nouveau_stateobj *so = so_new(2, 0); +	struct pipe_blend_color *bcol = &nv40->blend_colour; + +	so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1); +	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) | +		       (float_to_ubyte(bcol->color[0]) << 16) | +		       (float_to_ubyte(bcol->color[1]) <<  8) | +		       (float_to_ubyte(bcol->color[2]) <<  0))); + +	so_ref(so, &nv40->state.hw[NV40_STATE_BCOL]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_blend_colour = { +	.validate = nv40_state_blend_colour_validate, +	.dirty = { +		.pipe = NV40_NEW_BCOL, +		.hw = NV40_STATE_BCOL +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c new file 
mode 100644 index 0000000000..ce859def10 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_emit.c @@ -0,0 +1,184 @@ +#include "nv40_context.h" +#include "nv40_state.h" +#include "draw/draw_context.h" + +static struct nv40_state_entry *render_states[] = { +	&nv40_state_framebuffer, +	&nv40_state_rasterizer, +	&nv40_state_scissor, +	&nv40_state_stipple, +	&nv40_state_fragprog, +	&nv40_state_fragtex, +	&nv40_state_vertprog, +	&nv40_state_blend, +	&nv40_state_blend_colour, +	&nv40_state_zsa, +	&nv40_state_viewport, +	&nv40_state_vbo, +	NULL +}; + +static struct nv40_state_entry *swtnl_states[] = { +	&nv40_state_framebuffer, +	&nv40_state_rasterizer, +	&nv40_state_scissor, +	&nv40_state_stipple, +	&nv40_state_fragprog, +	&nv40_state_fragtex, +	&nv40_state_vertprog, +	&nv40_state_blend, +	&nv40_state_blend_colour, +	&nv40_state_zsa, +	&nv40_state_viewport, +	&nv40_state_vtxfmt, +	NULL +}; + +static void +nv40_state_do_validate(struct nv40_context *nv40, +		       struct nv40_state_entry **states) +{ +	const struct pipe_framebuffer_state *fb = &nv40->framebuffer; +	unsigned i; + +	for (i = 0; i < fb->nr_cbufs; i++) +		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; +	if (fb->zsbuf) +		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + +	while (*states) { +		struct nv40_state_entry *e = *states; + +		if (nv40->dirty & e->dirty.pipe) { +			if (e->validate(nv40)) +				nv40->state.dirty |= (1ULL << e->dirty.hw); +		} + +		states++; +	} +	nv40->dirty = 0; +} + +void +nv40_state_emit(struct nv40_context *nv40) +{ +	struct nv40_state *state = &nv40->state; +	struct nv40_screen *screen = nv40->screen; +	unsigned i, samplers; +	uint64_t states; + +	if (nv40->pctx_id != screen->cur_pctx) { +		for (i = 0; i < NV40_STATE_MAX; i++) { +			if (state->hw[i] && screen->state[i] != state->hw[i]) +				state->dirty |= (1ULL << i); +		} + +		screen->cur_pctx = nv40->pctx_id; +	} + +	for (i = 0, states = state->dirty; states; i++) { +		if (!(states & (1ULL << i))) +			continue; +		so_ref (state->hw[i], &nv40->screen->state[i]); +		if (state->hw[i]) +			so_emit(nv40->nvws, nv40->screen->state[i]); +		states &= ~(1ULL << i); +	} + +	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) | +			    (1ULL << NV40_STATE_FRAGTEX0))) { +		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); +		OUT_RING  (2); +		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); +		OUT_RING  (1); +	} + +	state->dirty = 0; + +	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]); +	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { +		if (!(samplers & (1 << i))) +			continue; +		so_emit_reloc_markers(nv40->nvws, +				      state->hw[NV40_STATE_FRAGTEX0+i]); +		samplers &= ~(1ULL << i); +	} +	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]); +	if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW) +		so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]); +} + +boolean +nv40_state_validate(struct nv40_context *nv40) +{ +	boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE; + +	if (nv40->render_mode != HW) { +		/* Don't even bother trying to go back to hw if none +		 * of the states that caused swtnl previously have changed. 
+		 */ +		if ((nv40->fallback_swtnl & nv40->dirty) +				!= nv40->fallback_swtnl) +			return FALSE; + +		/* Attempt to go to hwtnl again */ +		nv40->pipe.flush(&nv40->pipe, 0, NULL); +		nv40->dirty |= (NV40_NEW_VIEWPORT | +				NV40_NEW_VERTPROG | +				NV40_NEW_ARRAYS); +		nv40->render_mode = HW; +	} + +	nv40_state_do_validate(nv40, render_states); +	if (nv40->fallback_swtnl || nv40->fallback_swrast) +		return FALSE; +	 +	if (was_sw) +		NOUVEAU_ERR("swtnl->hw\n"); + +	return TRUE; +} + +boolean +nv40_state_validate_swtnl(struct nv40_context *nv40) +{ +	struct draw_context *draw = nv40->draw; + +	/* Setup for swtnl */ +	if (nv40->render_mode == HW) { +		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl); +		nv40->pipe.flush(&nv40->pipe, 0, NULL); +		nv40->dirty |= (NV40_NEW_VIEWPORT | +				NV40_NEW_VERTPROG | +				NV40_NEW_ARRAYS); +		nv40->render_mode = SWTNL; +	} + +	if (nv40->draw_dirty & NV40_NEW_VERTPROG) +		draw_bind_vertex_shader(draw, nv40->vertprog->draw); + +	if (nv40->draw_dirty & NV40_NEW_RAST) +		draw_set_rasterizer_state(draw, &nv40->rasterizer->pipe); + +	if (nv40->draw_dirty & NV40_NEW_UCP) +		draw_set_clip_state(draw, &nv40->clip); + +	if (nv40->draw_dirty & NV40_NEW_VIEWPORT) +		draw_set_viewport_state(draw, &nv40->viewport); + +	if (nv40->draw_dirty & NV40_NEW_ARRAYS) { +		draw_set_edgeflags(draw, nv40->edgeflags); +		draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf); +		draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt);	 +	} + +	nv40_state_do_validate(nv40, swtnl_states); +	if (nv40->fallback_swrast) { +		NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast); +		return FALSE; +	} + +	nv40->draw_dirty = 0; +	return TRUE; +} + diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c new file mode 100644 index 0000000000..454abad31f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_fb.c @@ -0,0 +1,162 @@ +#include "nv40_context.h" +#include "nouveau/nouveau_util.h" + +static struct pipe_buffer * +nv40_surface_buffer(struct pipe_surface *surface) +{ +	struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; +	return mt->buffer; +} + +static boolean +nv40_state_framebuffer_validate(struct nv40_context *nv40) +{ +	struct pipe_framebuffer_state *fb = &nv40->framebuffer; +	struct pipe_surface *rt[4], *zeta; +	uint32_t rt_enable, rt_format; +	int i, colour_format = 0, zeta_format = 0; +	struct nouveau_stateobj *so = so_new(64, 10); +	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; +	unsigned w = fb->width; +	unsigned h = fb->height; + +	rt_enable = 0; +	for (i = 0; i < fb->nr_cbufs; i++) { +		if (colour_format) { +			assert(colour_format == fb->cbufs[i]->format); +		} else { +			colour_format = fb->cbufs[i]->format; +			rt_enable |= (NV40TCL_RT_ENABLE_COLOR0 << i); +			rt[i] = fb->cbufs[i]; +		} +	} + +	if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 | +			 NV40TCL_RT_ENABLE_COLOR3)) +		rt_enable |= NV40TCL_RT_ENABLE_MRT; + +	if (fb->zsbuf) { +		zeta_format = fb->zsbuf->format; +		zeta = fb->zsbuf; +	} + +	if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { +		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); +		for (i = 1; i < fb->nr_cbufs; i++) +			assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + +		rt_format = NV40TCL_RT_FORMAT_TYPE_SWIZZLED | +		            log2i(fb->width) << NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT | +		            log2i(fb->height) << NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT; +	} +	else +		
rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR; + +	switch (colour_format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case 0: +		rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8; +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		rt_format |= NV40TCL_RT_FORMAT_COLOR_R5G6B5; +		break; +	default: +		assert(0); +	} + +	switch (zeta_format) { +	case PIPE_FORMAT_Z16_UNORM: +		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z16; +		break; +	case PIPE_FORMAT_Z24S8_UNORM: +	case 0: +		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z24S8; +		break; +	default: +		assert(0); +	} + +	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) { +		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1); +		so_reloc (so, nv40_surface_buffer(rt[0]), 0, rt_flags | NOUVEAU_BO_OR, +			  nv40->nvws->channel->vram->handle, +			  nv40->nvws->channel->gart->handle); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2); +		so_data  (so, rt[0]->stride); +		so_reloc (so, nv40_surface_buffer(rt[0]), rt[0]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +	} + +	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) { +		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1); +		so_reloc (so, nv40_surface_buffer(rt[1]), 0, rt_flags | NOUVEAU_BO_OR, +			  nv40->nvws->channel->vram->handle, +			  nv40->nvws->channel->gart->handle); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2); +		so_reloc (so, nv40_surface_buffer(rt[1]), rt[1]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		so_data  (so, rt[1]->stride); +	} + +	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) { +		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1); +		so_reloc (so, nv40_surface_buffer(rt[2]), 0, rt_flags | NOUVEAU_BO_OR, +			  nv40->nvws->channel->vram->handle, +			  nv40->nvws->channel->gart->handle); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1); +		so_reloc (so, nv40_surface_buffer(rt[2]), rt[2]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1); +		so_data  (so, rt[2]->stride); +	} + +	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) { +		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1); +		so_reloc (so, nv40_surface_buffer(rt[3]), 0, rt_flags | NOUVEAU_BO_OR, +			  nv40->nvws->channel->vram->handle, +			  nv40->nvws->channel->gart->handle); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1); +		so_reloc (so, nv40_surface_buffer(rt[3]), rt[3]->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1); +		so_data  (so, rt[3]->stride); +	} + +	if (zeta_format) { +		so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1); +		so_reloc (so, nv40_surface_buffer(zeta), 0, rt_flags | NOUVEAU_BO_OR, +			  nv40->nvws->channel->vram->handle, +			  nv40->nvws->channel->gart->handle); +		so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1); +		so_reloc (so, nv40_surface_buffer(zeta), zeta->offset, rt_flags | +			  NOUVEAU_BO_LOW, 0, 0); +		so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1); +		so_data  (so, zeta->stride); +	} + +	so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1); +	so_data  (so, rt_enable); +	so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3); +	so_data  (so, (w << 16) | 0); +	so_data  (so, (h << 16) | 0); +	so_data  (so, rt_format); +	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2); +	so_data  (so, (w << 16) | 0); +	so_data  (so, (h << 16) | 0); +	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2); +	so_data  (so, ((w - 1) << 16) | 0); +	so_data  (so, ((h - 1) << 16) | 0); 
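+	/* The width/height words above pack the size into the high 16 bits;
+	 * the low half (zero here) is presumably the x/y origin, so e.g. a
+	 * 640x480 target would be emitted as (640 << 16) | 0 and (480 << 16) | 0.
+	 * VIEWPORT_CLIP appears to take an inclusive maximum, hence the
+	 * (w - 1) / (h - 1) values. */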
+	so_method(so, nv40->screen->curie, 0x1d88, 1); +	so_data  (so, (1 << 12) | h); + +	so_ref(so, &nv40->state.hw[NV40_STATE_FB]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_framebuffer = { +	.validate = nv40_state_framebuffer_validate, +	.dirty = { +		.pipe = NV40_NEW_FB, +		.hw = NV40_STATE_FB +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_rasterizer.c b/src/gallium/drivers/nv40/nv40_state_rasterizer.c new file mode 100644 index 0000000000..9ecda5990f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_rasterizer_validate(struct nv40_context *nv40) +{ +	so_ref(nv40->rasterizer->so, +	       &nv40->state.hw[NV40_STATE_RAST]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_rasterizer = { +	.validate = nv40_state_rasterizer_validate, +	.dirty = { +		.pipe = NV40_NEW_RAST, +		.hw = NV40_STATE_RAST +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c new file mode 100644 index 0000000000..285239ef41 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv40_context.h" + +static boolean +nv40_state_scissor_validate(struct nv40_context *nv40) +{ +	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; +	struct pipe_scissor_state *s = &nv40->scissor; +	struct nouveau_stateobj *so; + +	if (nv40->state.hw[NV40_STATE_SCISSOR] && +	    (rast->scissor == 0 && nv40->state.scissor_enabled == 0)) +		return FALSE; +	nv40->state.scissor_enabled = rast->scissor; + +	so = so_new(3, 0); +	so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2); +	if (nv40->state.scissor_enabled) { +		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx); +		so_data  (so, ((s->maxy - s->miny) << 16) | s->miny); +	} else { +		so_data  (so, 4096 << 16); +		so_data  (so, 4096 << 16); +	} + +	so_ref(so, &nv40->state.hw[NV40_STATE_SCISSOR]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_scissor = { +	.validate = nv40_state_scissor_validate, +	.dirty = { +		.pipe = NV40_NEW_SCISSOR | NV40_NEW_RAST, +		.hw = NV40_STATE_SCISSOR +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c new file mode 100644 index 0000000000..b51024ad9b --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_stipple.c @@ -0,0 +1,39 @@ +#include "nv40_context.h" + +static boolean +nv40_state_stipple_validate(struct nv40_context *nv40) +{ +	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; +	struct nouveau_grobj *curie = nv40->screen->curie; +	struct nouveau_stateobj *so; + +	if (nv40->state.hw[NV40_STATE_STIPPLE] && +	   (rast->poly_stipple_enable == 0 && nv40->state.stipple_enabled == 0)) +		return FALSE; + +	if (rast->poly_stipple_enable) { +		unsigned i; + +		so = so_new(35, 0); +		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); +		so_data  (so, 1); +		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32); +		for (i = 0; i < 32; i++) +			so_data(so, nv40->stipple[i]); +	} else { +		so = so_new(2, 0); +		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_ref(so, &nv40->state.hw[NV40_STATE_STIPPLE]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_stipple = { +	.validate = nv40_state_stipple_validate, +	.dirty = { +		.pipe = NV40_NEW_STIPPLE | NV40_NEW_RAST, +		.hw = NV40_STATE_STIPPLE, +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c 
b/src/gallium/drivers/nv40/nv40_state_viewport.c new file mode 100644 index 0000000000..869a55b405 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_viewport.c @@ -0,0 +1,67 @@ +#include "nv40_context.h" + +static boolean +nv40_state_viewport_validate(struct nv40_context *nv40) +{ +	struct pipe_viewport_state *vpt = &nv40->viewport; +	struct nouveau_stateobj *so; +	unsigned bypass; + +	if (nv40->render_mode == HW && !nv40->rasterizer->pipe.bypass_clipping) +		bypass = 0; +	else +		bypass = 1; + +	if (nv40->state.hw[NV40_STATE_VIEWPORT] && +	    (bypass || !(nv40->dirty & NV40_NEW_VIEWPORT)) && +	    nv40->state.viewport_bypass == bypass) +		return FALSE; +	nv40->state.viewport_bypass = bypass; + +	so = so_new(11, 0); +	if (!bypass) { +		so_method(so, nv40->screen->curie, +			  NV40TCL_VIEWPORT_TRANSLATE_X, 8); +		so_data  (so, fui(vpt->translate[0])); +		so_data  (so, fui(vpt->translate[1])); +		so_data  (so, fui(vpt->translate[2])); +		so_data  (so, fui(vpt->translate[3])); +		so_data  (so, fui(vpt->scale[0])); +		so_data  (so, fui(vpt->scale[1])); +		so_data  (so, fui(vpt->scale[2])); +		so_data  (so, fui(vpt->scale[3])); +		so_method(so, nv40->screen->curie, 0x1d78, 1); +		so_data  (so, 1); +	} else { +		so_method(so, nv40->screen->curie, +			  NV40TCL_VIEWPORT_TRANSLATE_X, 8); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(0.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(1.0)); +		so_data  (so, fui(0.0)); +		/* Not entirely certain what this is yet.  The DDX uses this +		 * value also as it fixes rendering when you pass +		 * pre-transformed vertices to the GPU.  My best gusss is that +		 * this bypasses some culling/clipping stage.  Might be worth +		 * noting that points/lines are uneffected by whatever this +		 * value fixes, only filled polygons are effected. +		 */ +		so_method(so, nv40->screen->curie, 0x1d78, 1); +		so_data  (so, 0x110); +	} + +	so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_viewport = { +	.validate = nv40_state_viewport_validate, +	.dirty = { +		.pipe = NV40_NEW_VIEWPORT | NV40_NEW_RAST, +		.hw = NV40_STATE_VIEWPORT +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_state_zsa.c b/src/gallium/drivers/nv40/nv40_state_zsa.c new file mode 100644 index 0000000000..fb760677c8 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_zsa_validate(struct nv40_context *nv40) +{ +	so_ref(nv40->zsa->so, +	       &nv40->state.hw[NV40_STATE_ZSA]); +	return TRUE; +} + +struct nv40_state_entry nv40_state_zsa = { +	.validate = nv40_state_zsa_validate, +	.dirty = { +		.pipe = NV40_NEW_ZSA, +		.hw = NV40_STATE_ZSA +	} +}; diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c new file mode 100644 index 0000000000..c4a5fb20d9 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "nv40_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv40_surface_copy(struct pipe_context *pipe, boolean do_flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv04_surface_2d *eng2d = nv40->screen->eng2d; + +	if (do_flip) { +		desty += height; +		while (height--) { +			eng2d->copy(eng2d, dest, destx, desty--, src, +				    srcx, srcy++, width, 1); +		} +		return; +	} + +	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv40_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nv04_surface_2d *eng2d = nv40->screen->eng2d; + +	eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv40_init_surface_functions(struct nv40_context *nv40) +{ +	nv40->pipe.surface_copy = nv40_surface_copy; +	nv40->pipe.surface_fill = nv40_surface_fill; +} diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c new file mode 100644 index 0000000000..8f1834628f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vbo.c @@ -0,0 +1,555 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ +	switch (pipe) { +	case PIPE_FORMAT_R32_FLOAT: +	case PIPE_FORMAT_R32G32_FLOAT: +	case PIPE_FORMAT_R32G32B32_FLOAT: +	case PIPE_FORMAT_R32G32B32A32_FLOAT: +		*fmt = NV40TCL_VTXFMT_TYPE_FLOAT; +		break; +	case PIPE_FORMAT_R8_UNORM: +	case PIPE_FORMAT_R8G8_UNORM: +	case PIPE_FORMAT_R8G8B8_UNORM: +	case PIPE_FORMAT_R8G8B8A8_UNORM: +		*fmt = NV40TCL_VTXFMT_TYPE_UBYTE; +		break; +	case PIPE_FORMAT_R16_SSCALED: +	case PIPE_FORMAT_R16G16_SSCALED: +	case PIPE_FORMAT_R16G16B16_SSCALED: +	case 
PIPE_FORMAT_R16G16B16A16_SSCALED: +		*fmt = NV40TCL_VTXFMT_TYPE_USHORT; +		break; +	default: +		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); +		return 1; +	} + +	switch (pipe) { +	case PIPE_FORMAT_R8_UNORM: +	case PIPE_FORMAT_R32_FLOAT: +	case PIPE_FORMAT_R16_SSCALED: +		*ncomp = 1; +		break; +	case PIPE_FORMAT_R8G8_UNORM: +	case PIPE_FORMAT_R32G32_FLOAT: +	case PIPE_FORMAT_R16G16_SSCALED: +		*ncomp = 2; +		break; +	case PIPE_FORMAT_R8G8B8_UNORM: +	case PIPE_FORMAT_R32G32B32_FLOAT: +	case PIPE_FORMAT_R16G16B16_SSCALED: +		*ncomp = 3; +		break; +	case PIPE_FORMAT_R8G8B8A8_UNORM: +	case PIPE_FORMAT_R32G32B32A32_FLOAT: +	case PIPE_FORMAT_R16G16B16A16_SSCALED: +		*ncomp = 4; +		break; +	default: +		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); +		return 1; +	} + +	return 0; +} + +static boolean +nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib, +		    unsigned ib_size) +{ +	struct pipe_screen *pscreen = &nv40->screen->pipe; +	unsigned type; + +	if (!ib) { +		nv40->idxbuf = NULL; +		nv40->idxbuf_format = 0xdeadbeef; +		return FALSE; +	} + +	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1) +		return FALSE; + +	switch (ib_size) { +	case 2: +		type = NV40TCL_IDXBUF_FORMAT_TYPE_U16; +		break; +	case 4: +		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32; +		break; +	default: +		return FALSE; +	} + +	if (ib != nv40->idxbuf || +	    type != nv40->idxbuf_format) { +		nv40->dirty |= NV40_NEW_ARRAYS; +		nv40->idxbuf = ib; +		nv40->idxbuf_format = type; +	} + +	return TRUE; +} + +static boolean +nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so, +		       int attrib, struct pipe_vertex_element *ve, +		       struct pipe_vertex_buffer *vb) +{ +	struct pipe_winsys *ws = nv40->pipe.winsys; +	struct nouveau_grobj *curie = nv40->screen->curie; +	unsigned type, ncomp; +	void *map; + +	if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) +		return FALSE; + +	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); +	map += vb->buffer_offset + ve->src_offset; + +	switch (type) { +	case NV40TCL_VTXFMT_TYPE_FLOAT: +	{ +		float *v = map; + +		switch (ncomp) { +		case 4: +			so_method(so, curie, NV40TCL_VTX_ATTR_4F_X(attrib), 4); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			so_data  (so, fui(v[2])); +			so_data  (so, fui(v[3])); +			break; +		case 3: +			so_method(so, curie, NV40TCL_VTX_ATTR_3F_X(attrib), 3); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			so_data  (so, fui(v[2])); +			break; +		case 2: +			so_method(so, curie, NV40TCL_VTX_ATTR_2F_X(attrib), 2); +			so_data  (so, fui(v[0])); +			so_data  (so, fui(v[1])); +			break; +		case 1: +			so_method(so, curie, NV40TCL_VTX_ATTR_1F(attrib), 1); +			so_data  (so, fui(v[0])); +			break; +		default: +			ws->buffer_unmap(ws, vb->buffer); +			return FALSE; +		} +	} +		break; +	default: +		ws->buffer_unmap(ws, vb->buffer); +		return FALSE; +	} + +	ws->buffer_unmap(ws, vb->buffer); + +	return TRUE; +} + +boolean +nv40_draw_arrays(struct pipe_context *pipe, +		 unsigned mode, unsigned start, unsigned count) +{ +	struct nv40_context *nv40 = nv40_context(pipe); +	struct nouveau_channel *chan = nv40->nvws->channel; +	unsigned restart; + +	nv40_vbo_set_idxbuf(nv40, NULL, 0); +	if (FORCE_SWTNL || !nv40_state_validate(nv40)) { +		return nv40_draw_elements_swtnl(pipe, NULL, 0, +						mode, start, count); +	} + +	while (count) { +		unsigned vc, nr; + +		nv40_state_emit(nv40); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, +					mode, start, 
count, &restart); +		if (!vc) { +			FIRE_RING(NULL); +			continue; +		} + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		nr = (vc & 0xff); +		if (nr) { +			BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1); +			OUT_RING  (((nr - 1) << 24) | start); +			start += nr; +		} + +		nr = vc >> 8; +		while (nr) { +			unsigned push = nr > 2047 ? 2047 : nr; + +			nr -= push; + +			BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push); +			while (push--) { +				OUT_RING(((0x100 - 1) << 24) | start); +				start += 0x100; +			} +		} + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (0); + +		count -= vc; +		start = restart; +	} + +	pipe->flush(pipe, 0, NULL); +	return TRUE; +} + +static INLINE void +nv40_draw_elements_u08(struct nv40_context *nv40, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv40->nvws->channel; + +	while (count) { +		uint8_t *elts = (uint8_t *)ib + start; +		unsigned vc, push, restart; + +		nv40_state_emit(nv40); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		if (vc & 1) { +			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); +			OUT_RING  (elts[0]); +			elts++; vc--; +		} + +		while (vc) { +			unsigned i; + +			push = MIN2(vc, 2047 * 2); + +			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); +			for (i = 0; i < push; i+=2) +				OUT_RING((elts[i+1] << 16) | elts[i]); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (0); + +		start = restart; +	} +} + +static INLINE void +nv40_draw_elements_u16(struct nv40_context *nv40, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv40->nvws->channel; + +	while (count) { +		uint16_t *elts = (uint16_t *)ib + start; +		unsigned vc, push, restart; + +		nv40_state_emit(nv40); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		if (vc & 1) { +			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); +			OUT_RING  (elts[0]); +			elts++; vc--; +		} + +		while (vc) { +			unsigned i; + +			push = MIN2(vc, 2047 * 2); + +			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); +			for (i = 0; i < push; i+=2) +				OUT_RING((elts[i+1] << 16) | elts[i]); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (0); + +		start = restart; +	} +} + +static INLINE void +nv40_draw_elements_u32(struct nv40_context *nv40, void *ib, +		       unsigned mode, unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv40->nvws->channel; + +	while (count) { +		uint32_t *elts = (uint32_t *)ib + start; +		unsigned vc, push, restart; + +		nv40_state_emit(nv40); + +		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, +					mode, start, count, &restart); +		if (vc == 0) { +			FIRE_RING(NULL); +			continue; +		} +		count -= vc; + +		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); +		OUT_RING  (nvgl_primitive(mode)); + +		while (vc) { +			push = MIN2(vc, 2047); + +			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push); +			OUT_RINGp    (elts, push); + +			vc -= push; +			elts += push; +		} + +		BEGIN_RING(curie, 
NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static boolean
+nv40_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
+
+	switch (ib_size) {
+	case 1:
+		nv40_draw_elements_u08(nv40, map, mode, start, count);
+		break;
+	case 2:
+		nv40_draw_elements_u16(nv40, map, mode, start, count);
+		break;
+	case 4:
+		nv40_draw_elements_u32(nv40, map, mode, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	ws->buffer_unmap(ws, ib);
+	return TRUE;
+}
+
+static boolean
+nv40_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_channel *chan = nv40->nvws->channel;
+	unsigned restart;
+
+	while (count) {
+		unsigned nr, vc;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	return TRUE;
+}
+
+boolean
+nv40_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	boolean idxbuf;
+
+	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+
+	if (idxbuf) {
+		nv40_draw_elements_vbo(pipe, mode, start, count);
+	} else {
+		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static boolean
+nv40_vbo_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct pipe_buffer *ib = nv40->idxbuf;
+	unsigned ib_format = nv40->idxbuf_format;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	int hw;
+
+	if (nv40->edgeflags) {
+		nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
+		return FALSE;
+	}
+
+	vtxbuf = so_new(20, 18);
+	so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
+	vtxfmt = so_new(17, 0);
+	so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
+
+	for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nv40->vtxelt[hw];
+		vb = &nv40->vtxbuf[ve->vertex_buffer_index];
+
+		if (!vb->stride) {
+			if (!sattr)
+				sattr = so_new(16 * 5, 0);
+
+			if 
(nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) { +				so_data(vtxbuf, 0); +				so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT); +				continue; +			} +		} + +		if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { +			nv40->fallback_swtnl |= NV40_NEW_ARRAYS; +			so_ref(NULL, &vtxbuf); +			so_ref(NULL, &vtxfmt); +			return FALSE; +		} + +		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, +			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, +			 0, NV40TCL_VTXBUF_ADDRESS_DMA1); +		so_data (vtxfmt, ((vb->stride << NV40TCL_VTXFMT_STRIDE_SHIFT) | +				  (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type)); +	} + +	if (ib) { +		so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2); +		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); +		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, +			  0, NV40TCL_IDXBUF_FORMAT_DMA1); +	} + +	so_method(vtxbuf, curie, 0x1710, 1); +	so_data  (vtxbuf, 0); + +	so_ref(vtxbuf, &nv40->state.hw[NV40_STATE_VTXBUF]); +	nv40->state.dirty |= (1ULL << NV40_STATE_VTXBUF); +	so_ref(vtxfmt, &nv40->state.hw[NV40_STATE_VTXFMT]); +	nv40->state.dirty |= (1ULL << NV40_STATE_VTXFMT); +	so_ref(sattr, &nv40->state.hw[NV40_STATE_VTXATTR]); +	nv40->state.dirty |= (1ULL << NV40_STATE_VTXATTR); +	return FALSE; +} + +struct nv40_state_entry nv40_state_vbo = { +	.validate = nv40_vbo_validate, +	.dirty = { +		.pipe = NV40_NEW_ARRAYS, +		.hw = 0, +	} +}; + diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c new file mode 100644 index 0000000000..0862386638 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vertprog.c @@ -0,0 +1,1070 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +/* TODO (at least...): + *  1. Indexed consts  + ARL + *  3. NV_vp11, NV_vp2, NV_vp3 features + *       - extra arith opcodes + *       - branching + *       - texture sampling + *       - indexed attribs + *       - indexed results + *  4. 
bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) + +#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n)) + +struct nv40_vpc { +	struct nv40_vertex_program *vp; + +	struct nv40_vertex_program_exec *vpi; + +	unsigned r_temps; +	unsigned r_temps_discard; +	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; +	struct nv40_sreg *r_address; +	struct nv40_sreg *r_temp; + +	struct nv40_sreg *imm; +	unsigned nr_imm; + +	unsigned hpos_idx; +}; + +static struct nv40_sreg +temp(struct nv40_vpc *vpc) +{ +	int idx = ffs(~vpc->r_temps) - 1; + +	if (idx < 0) { +		NOUVEAU_ERR("out of temps!!\n"); +		assert(0); +		return nv40_sr(NV40SR_TEMP, 0); +	} + +	vpc->r_temps |= (1 << idx); +	vpc->r_temps_discard |= (1 << idx); +	return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_vpc *vpc) +{ +	vpc->r_temps &= ~vpc->r_temps_discard; +	vpc->r_temps_discard = 0; +} + +static struct nv40_sreg +constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w) +{ +	struct nv40_vertex_program *vp = vpc->vp; +	struct nv40_vertex_program_data *vpd; +	int idx; + +	if (pipe >= 0) { +		for (idx = 0; idx < vp->nr_consts; idx++) { +			if (vp->consts[idx].index == pipe) +				return nv40_sr(NV40SR_CONST, idx); +		} +	} + +	idx = vp->nr_consts++; +	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); +	vpd = &vp->consts[idx]; + +	vpd->index = pipe; +	vpd->value[0] = x; +	vpd->value[1] = y; +	vpd->value[2] = z; +	vpd->value[3] = w; +	return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ +	nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src) +{ +	struct nv40_vertex_program *vp = vpc->vp; +	uint32_t sr = 0; + +	switch (src.type) { +	case NV40SR_TEMP: +		sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT); +		sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT); +		break; +	case NV40SR_INPUT: +		sr |= (NV40_VP_SRC_REG_TYPE_INPUT << +		       NV40_VP_SRC_REG_TYPE_SHIFT); +		vp->ir |= (1 << src.index); +		hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT); +		break; +	case NV40SR_CONST: +		sr |= (NV40_VP_SRC_REG_TYPE_CONST << +		       NV40_VP_SRC_REG_TYPE_SHIFT); +		assert(vpc->vpi->const_index == -1 || +		       vpc->vpi->const_index == src.index); +		vpc->vpi->const_index = src.index; +		break; +	case NV40SR_NONE: +		sr |= (NV40_VP_SRC_REG_TYPE_INPUT << +		       NV40_VP_SRC_REG_TYPE_SHIFT); +		break; +	default: +		assert(0); +	} + +	if (src.negate) +		sr |= NV40_VP_SRC_NEGATE; + +	if (src.abs) +		hw[0] |= (1 << (21 + pos)); + +	sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) | +	       (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) | +	       (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) | +	       (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT)); + +	switch (pos) { +	case 0: +		hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >> +			  NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT; +		hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) << +			  NV40_VP_INST_SRC0L_SHIFT; +		break; +	case 1: +		hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT; +		break; +	case 2: +		hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >> +			  
NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT; +		hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) << +			  NV40_VP_INST_SRC2L_SHIFT; +		break; +	default: +		assert(0); +	} +} + +static void +emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst) +{ +	struct nv40_vertex_program *vp = vpc->vp; + +	switch (dst.type) { +	case NV40SR_TEMP: +		hw[3] |= NV40_VP_INST_DEST_MASK; +		if (slot == 0) { +			hw[0] |= (dst.index << +				  NV40_VP_INST_VEC_DEST_TEMP_SHIFT); +		} else { +			hw[3] |= (dst.index <<  +				  NV40_VP_INST_SCA_DEST_TEMP_SHIFT); +		} +		break; +	case NV40SR_OUTPUT: +		switch (dst.index) { +		case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; +		case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; +		case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; +		case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; +		case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; +		case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break; +		case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; +		case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; +		case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; +		case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; +		case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; +		case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; +		case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; +		case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; +		case NV40_VP_INST_DEST_CLIP(0): +			vp->or |= (1 << 6); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0; +			dst.index = NV40_VP_INST_DEST_FOGC; +			break; +		case NV40_VP_INST_DEST_CLIP(1): +			vp->or |= (1 << 7); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1; +			dst.index = NV40_VP_INST_DEST_FOGC; +			break; +		case NV40_VP_INST_DEST_CLIP(2): +			vp->or |= (1 << 8); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2; +			dst.index = NV40_VP_INST_DEST_FOGC; +			break; +		case NV40_VP_INST_DEST_CLIP(3): +			vp->or |= (1 << 9); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3; +			dst.index = NV40_VP_INST_DEST_PSZ; +			break; +		case NV40_VP_INST_DEST_CLIP(4): +			vp->or |= (1 << 10); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4; +			dst.index = NV40_VP_INST_DEST_PSZ; +			break; +		case NV40_VP_INST_DEST_CLIP(5): +			vp->or |= (1 << 11); +			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5; +			dst.index = NV40_VP_INST_DEST_PSZ; +			break; +		default: +			break; +		} + +		hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT); +		if (slot == 0) { +			hw[0] |= NV40_VP_INST_VEC_RESULT; +			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); +		} else { +			hw[3] |= NV40_VP_INST_SCA_RESULT; +			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; +		} +		break; +	default: +		assert(0); +	} +} + +static void +nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op, +	      struct nv40_sreg dst, int mask, +	      struct nv40_sreg s0, struct nv40_sreg s1, +	      struct nv40_sreg s2) +{ +	struct nv40_vertex_program *vp = vpc->vp; +	uint32_t *hw; + +	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); +	vpc->vpi = &vp->insns[vp->nr_insns - 1]; +	memset(vpc->vpi, 0, sizeof(*vpc->vpi)); +	vpc->vpi->const_index = -1; + +	hw = vpc->vpi->data; + +	hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT); +	hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) | +		  (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) | +		  (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) | +		  (3 << NV40_VP_INST_COND_SWZ_W_SHIFT)); + +	if (slot == 0) { +		hw[1] |= (op << 
NV40_VP_INST_VEC_OPCODE_SHIFT); +		hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; +		hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT); +	} else { +		hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT); +		hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20)); +		hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT); +	} + +	emit_dst(vpc, hw, slot, dst); +	emit_src(vpc, hw, 0, s0); +	emit_src(vpc, hw, 1, s1); +	emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) { +	struct nv40_sreg src; + +	switch (fsrc->SrcRegister.File) { +	case TGSI_FILE_INPUT: +		src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index); +		break; +	case TGSI_FILE_CONSTANT: +		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); +		break; +	case TGSI_FILE_IMMEDIATE: +		src = vpc->imm[fsrc->SrcRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		src = vpc->r_temp[fsrc->SrcRegister.Index]; +		break; +	default: +		NOUVEAU_ERR("bad src file\n"); +		break; +	} + +	src.abs = fsrc->SrcRegisterExtMod.Absolute; +	src.negate = fsrc->SrcRegister.Negate; +	src.swz[0] = fsrc->SrcRegister.SwizzleX; +	src.swz[1] = fsrc->SrcRegister.SwizzleY; +	src.swz[2] = fsrc->SrcRegister.SwizzleZ; +	src.swz[3] = fsrc->SrcRegister.SwizzleW; +	return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) { +	struct nv40_sreg dst; + +	switch (fdst->DstRegister.File) { +	case TGSI_FILE_OUTPUT: +		dst = vpc->r_result[fdst->DstRegister.Index]; +		break; +	case TGSI_FILE_TEMPORARY: +		dst = vpc->r_temp[fdst->DstRegister.Index]; +		break; +	case TGSI_FILE_ADDRESS: +		dst = vpc->r_address[fdst->DstRegister.Index]; +		break; +	default: +		NOUVEAU_ERR("bad dst file\n"); +		break; +	} + +	return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ +	int mask = 0; + +	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; +	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; +	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; +	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; +	return mask; +} + +static boolean +src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc, +	       struct nv40_sreg *src) +{ +	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); +	struct nv40_sreg tgsi = tgsi_src(vpc, fsrc); +	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; +	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, +			fsrc->SrcRegisterExtSwz.NegateY, +			fsrc->SrcRegisterExtSwz.NegateZ, +			fsrc->SrcRegisterExtSwz.NegateW }; +	uint c; + +	for (c = 0; c < 4; c++) { +		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { +		case TGSI_EXTSWIZZLE_X: +		case TGSI_EXTSWIZZLE_Y: +		case TGSI_EXTSWIZZLE_Z: +		case TGSI_EXTSWIZZLE_W: +			mask |= tgsi_mask(1 << c); +			break; +		case TGSI_EXTSWIZZLE_ZERO: +			zero_mask |= tgsi_mask(1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		case TGSI_EXTSWIZZLE_ONE: +			one_mask |= tgsi_mask(1 << c); +			tgsi.swz[c] = SWZ_X; +			break; +		default: +			assert(0); +		} + +		if (!tgsi.negate && neg[c]) +			neg_mask |= tgsi_mask(1 << c); +	} + +	if (mask == MASK_ALL && !neg_mask) +		return TRUE; + +	*src = temp(vpc); + +	if (mask) +		arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none); + +	if (zero_mask) +		arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none); + +	if (one_mask) +		arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none); + +	if (neg_mask) { +		struct nv40_sreg one = temp(vpc); +		arith(vpc, 0, OP_STR, one, neg_mask, one, none, none); +		arith(vpc, 0, OP_MUL, *src, 
neg_mask, *src, neg(one), none); +	} + +	return FALSE; +} + +static boolean +nv40_vertprog_parse_instruction(struct nv40_vpc *vpc, +				const struct tgsi_full_instruction *finst) +{ +	struct nv40_sreg src[3], dst, tmp; +	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); +	int mask; +	int ai = -1, ci = -1, ii = -1; +	int i; + +	if (finst->Instruction.Opcode == TGSI_OPCODE_END) +		return TRUE; + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; +		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { +			src[i] = tgsi_src(vpc, fsrc); +		} +	} + +	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fsrc; + +		fsrc = &finst->FullSrcRegisters[i]; + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +		case TGSI_FILE_CONSTANT: +		case TGSI_FILE_TEMPORARY: +			if (!src_native_swz(vpc, fsrc, &src[i])) +				continue; +			break; +		default: +			break; +		} + +		switch (fsrc->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			if (ai == -1 || ai == fsrc->SrcRegister.Index) { +				ai = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_CONSTANT: +			if ((ci == -1 && ii == -1) || +			    ci == fsrc->SrcRegister.Index) { +				ci = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_IMMEDIATE: +			if ((ci == -1 && ii == -1) || +			    ii == fsrc->SrcRegister.Index) { +				ii = fsrc->SrcRegister.Index; +				src[i] = tgsi_src(vpc, fsrc); +			} else { +				src[i] = temp(vpc); +				arith(vpc, 0, OP_MOV, src[i], MASK_ALL, +				      tgsi_src(vpc, fsrc), none, none); +			} +			break; +		case TGSI_FILE_TEMPORARY: +			/* handled above */ +			break; +		default: +			NOUVEAU_ERR("bad src file\n"); +			return FALSE; +		} +	} + +	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]); +	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + +	switch (finst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); +		break; +	case TGSI_OPCODE_ADD: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); +		break; +	case TGSI_OPCODE_ARL: +		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_DP3: +		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DP4: +		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DPH: +		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_DST: +		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_EX2: +		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_EXP: +		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_FLR: +		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_FRC: +		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_LG2: +		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LIT: +		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_LOG: +		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); 
+		break; +	case TGSI_OPCODE_MAD: +		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); +		break; +	case TGSI_OPCODE_MAX: +		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MIN: +		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_MOV: +		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); +		break; +	case TGSI_OPCODE_MUL: +		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_POW: +		tmp = temp(vpc); +		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, +		      swz(src[0], X, X, X, X)); +		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), +		      swz(src[1], X, X, X, X), none); +		arith(vpc, 1, OP_EX2, dst, mask, none, none, +		      swz(tmp, X, X, X, X)); +		break; +	case TGSI_OPCODE_RCP: +		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); +		break; +	case TGSI_OPCODE_RET: +		break; +	case TGSI_OPCODE_RSQ: +		arith(vpc, 1, OP_RSQ, dst, mask, none, none, abs(src[0])); +		break; +	case TGSI_OPCODE_SGE: +		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SLT: +		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); +		break; +	case TGSI_OPCODE_SUB: +		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); +		break; +	case TGSI_OPCODE_XPD: +		tmp = temp(vpc); +		arith(vpc, 0, OP_MUL, tmp, mask, +		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); +		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), +		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), +		      neg(tmp)); +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); +		return FALSE; +	} + +	release_temps(vpc); +	return TRUE; +} + +static boolean +nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc, +				const struct tgsi_full_declaration *fdec) +{ +	unsigned idx = fdec->DeclarationRange.First; +	int hw; + +	switch (fdec->Semantic.SemanticName) { +	case TGSI_SEMANTIC_POSITION: +		hw = NV40_VP_INST_DEST_POS; +		vpc->hpos_idx = idx; +		break; +	case TGSI_SEMANTIC_COLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV40_VP_INST_DEST_COL0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV40_VP_INST_DEST_COL1; +		} else { +			NOUVEAU_ERR("bad colour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_BCOLOR: +		if (fdec->Semantic.SemanticIndex == 0) { +			hw = NV40_VP_INST_DEST_BFC0; +		} else +		if (fdec->Semantic.SemanticIndex == 1) { +			hw = NV40_VP_INST_DEST_BFC1; +		} else { +			NOUVEAU_ERR("bad bcolour semantic index\n"); +			return FALSE; +		} +		break; +	case TGSI_SEMANTIC_FOG: +		hw = NV40_VP_INST_DEST_FOGC; +		break; +	case TGSI_SEMANTIC_PSIZE: +		hw = NV40_VP_INST_DEST_PSZ; +		break; +	case TGSI_SEMANTIC_GENERIC: +		if (fdec->Semantic.SemanticIndex <= 7) { +			hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); +		} else { +			NOUVEAU_ERR("bad generic semantic index\n"); +			return FALSE; +		} +		break; +	default: +		NOUVEAU_ERR("bad output semantic\n"); +		return FALSE; +	} + +	vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); +	return TRUE; +} + +static boolean +nv40_vertprog_prepare(struct nv40_vpc *vpc) +{ +	struct tgsi_parse_context p; +	int high_temp = -1, high_addr = -1, nr_imm = 0, i; + +	tgsi_parse_init(&p, vpc->vp->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch(tok->Token.Type) { +		case TGSI_TOKEN_TYPE_IMMEDIATE: +			nr_imm++; +			break; +		case 
TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp) {
+					high_temp =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+#if 0 /* this would be nice.. except gallium doesn't track it */
+			case TGSI_FILE_ADDRESS:
+				if (fdec->DeclarationRange.Last > high_addr) {
+					high_addr =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+#endif
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_vertprog_parse_decl_output(vpc, fdec))
+					return FALSE;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->FullDstRegisters[0];
+
+			if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
+				if (fdst->DstRegister.Index > high_addr)
+					high_addr = fdst->DstRegister.Index;
+			}
+		
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
+		assert(vpc->imm);
+	}
+
+	if (++high_temp) {
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);
+	}
+
+	if (++high_addr) {
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);
+	}
+
+	vpc->r_temps_discard = 0;
+	return TRUE;
+}
+
+static void
+nv40_vertprog_translate(struct nv40_context *nv40,
+			struct nv40_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_vpc *vpc = NULL;
+	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nv40_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+
+	if (!nv40_vertprog_prepare(vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code to the vtxprog
+	 * to handle clip planes later.
+	 */ +	if (vp->ucp.nr)  { +		vpc->r_result[vpc->hpos_idx] = temp(vpc); +		vpc->r_temps_discard = 0; +	} + +	tgsi_parse_init(&parse, vp->pipe.tokens); + +	while (!tgsi_parse_end_of_tokens(&parse)) { +		tgsi_parse_token(&parse); + +		switch (parse.FullToken.Token.Type) { +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			const struct tgsi_full_immediate *imm; + +			imm = &parse.FullToken.FullImmediate; +			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +			assert(imm->Immediate.NrTokens == 4 + 1); +			vpc->imm[vpc->nr_imm++] = +				constant(vpc, -1, +					 imm->u.ImmediateFloat32[0].Float, +					 imm->u.ImmediateFloat32[1].Float, +					 imm->u.ImmediateFloat32[2].Float, +					 imm->u.ImmediateFloat32[3].Float); +		} +			break; +		case TGSI_TOKEN_TYPE_INSTRUCTION: +		{ +			const struct tgsi_full_instruction *finst; +			finst = &parse.FullToken.FullInstruction; +			if (!nv40_vertprog_parse_instruction(vpc, finst)) +				goto out_err; +		} +			break; +		default: +			break; +		} +	} + +	/* Write out HPOS if it was redirected to a temp earlier */ +	if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) { +		struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT, +						NV40_VP_INST_DEST_POS); +		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; + +		arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none); +	} + +	/* Insert code to handle user clip planes */ +	for (i = 0; i < vp->ucp.nr; i++) { +		struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT, +						NV40_VP_INST_DEST_CLIP(i)); +		struct nv40_sreg ceqn = constant(vpc, -1, +						 nv40->clip.ucp[i][0], +						 nv40->clip.ucp[i][1], +						 nv40->clip.ucp[i][2], +						 nv40->clip.ucp[i][3]); +		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; +		unsigned mask; + +		switch (i) { +		case 0: case 3: mask = MASK_Y; break; +		case 1: case 4: mask = MASK_Z; break; +		case 2: case 5: mask = MASK_W; break; +		default: +			NOUVEAU_ERR("invalid clip dist #%d\n", i); +			goto out_err; +		} + +		arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none); +	} + +	vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST; +	vp->translated = TRUE; +out_err: +	tgsi_parse_free(&parse); +	if (vpc->r_temp) +		FREE(vpc->r_temp);  +	if (vpc->r_address) +		FREE(vpc->r_address);  +	if (vpc->imm)	 +		FREE(vpc->imm);  +	FREE(vpc); +} + +static boolean +nv40_vertprog_validate(struct nv40_context *nv40) +{  +	struct nouveau_winsys *nvws = nv40->nvws; +	struct pipe_winsys *ws = nv40->pipe.winsys; +	struct nouveau_grobj *curie = nv40->screen->curie; +	struct nv40_vertex_program *vp; +	struct pipe_buffer *constbuf; +	boolean upload_code = FALSE, upload_data = FALSE; +	int i; + +	if (nv40->render_mode == HW) { +		vp = nv40->vertprog; +		constbuf = nv40->constbuf[PIPE_SHADER_VERTEX]; + +		if ((nv40->dirty & NV40_NEW_UCP) || +		    memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) { +			nv40_vertprog_destroy(nv40, vp); +			memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp)); +		} +	} else { +		vp = nv40->swtnl.vertprog; +		constbuf = NULL; +	} + +	/* Translate TGSI shader into hw bytecode */ +	if (vp->translated) +		goto check_gpu_resources; + +	nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG; +	nv40_vertprog_translate(nv40, vp); +	if (!vp->translated) { +		nv40->fallback_swtnl |= NV40_NEW_VERTPROG; +		return FALSE; +	} + +check_gpu_resources: +	/* Allocate hw vtxprog exec slots */ +	if (!vp->exec) { +		struct nouveau_resource *heap = nv40->screen->vp_exec_heap; +		struct nouveau_stateobj *so; +		uint vplen = vp->nr_insns; + +		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { +			while (heap->next 
&& heap->size < vplen) { +				struct nv40_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->exec); +			} + +			if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) +				assert(0); +		} + +		so = so_new(7, 0); +		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1); +		so_data  (so, vp->exec->start); +		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2); +		so_data  (so, vp->ir); +		so_data  (so, vp->or); +		so_method(so, curie,  NV40TCL_CLIP_PLANE_ENABLE, 1); +		so_data  (so, vp->clip_ctrl); +		so_ref(so, &vp->so); + +		upload_code = TRUE; +	} + +	/* Allocate hw vtxprog const slots */ +	if (vp->nr_consts && !vp->data) { +		struct nouveau_resource *heap = nv40->screen->vp_data_heap; + +		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { +			while (heap->next && heap->size < vp->nr_consts) { +				struct nv40_vertex_program *evict; +				 +				evict = heap->next->priv; +				nvws->res_free(&evict->data); +			} + +			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) +				assert(0); +		} + +		/*XXX: handle this some day */ +		assert(vp->data->start >= vp->data_start_min); + +		upload_data = TRUE; +		if (vp->data_start != vp->data->start) +			upload_code = TRUE; +	} + +	/* If exec or data segments moved we need to patch the program to +	 * fixup offsets and register IDs. +	 */ +	if (vp->exec_start != vp->exec->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->has_branch_offset) { +				assert(0); +			} +		} + +		vp->exec_start = vp->exec->start; +	} + +	if (vp->nr_consts && vp->data_start != vp->data->start) { +		for (i = 0; i < vp->nr_insns; i++) { +			struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + +			if (vpi->const_index >= 0) { +				vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK; +				vpi->data[1] |= +					(vpi->const_index + vp->data->start) << +					NV40_VP_INST_CONST_SRC_SHIFT; + +			} +		} + +		vp->data_start = vp->data->start; +	} + +	/* Update + Upload constant values */ +	if (vp->nr_consts) { +		float *map = NULL; + +		if (constbuf) { +			map = ws->buffer_map(ws, constbuf, +					     PIPE_BUFFER_USAGE_CPU_READ); +		} + +		for (i = 0; i < vp->nr_consts; i++) { +			struct nv40_vertex_program_data *vpd = &vp->consts[i]; + +			if (vpd->index >= 0) { +				if (!upload_data && +				    !memcmp(vpd->value, &map[vpd->index * 4], +					    4 * sizeof(float))) +					continue; +				memcpy(vpd->value, &map[vpd->index * 4], +				       4 * sizeof(float)); +			} + +			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5); +			OUT_RING  (i + vp->data->start); +			OUT_RINGp ((uint32_t *)vpd->value, 4); +		} + +		if (constbuf) +			ws->buffer_unmap(ws, constbuf); +	} + +	/* Upload vtxprog */ +	if (upload_code) { +#if 0 +		for (i = 0; i < vp->nr_insns; i++) { +			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]); +			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]); +			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]); +			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]); +		} +#endif +		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1); +		OUT_RING  (vp->exec->start); +		for (i = 0; i < vp->nr_insns; i++) { +			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4); +			OUT_RINGp (vp->insns[i].data, 4); +		} +	} + +	if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) { +		so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]); +		return TRUE; +	} + +	return FALSE; +} + +void +nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp) +{ +	struct 
nouveau_winsys *nvws = nv40->screen->nvws; + +	vp->translated = FALSE; + +	if (vp->nr_insns) { +		FREE(vp->insns); +		vp->insns = NULL; +		vp->nr_insns = 0; +	} + +	if (vp->nr_consts) { +		FREE(vp->consts); +		vp->consts = NULL; +		vp->nr_consts = 0; +	} + +	nvws->res_free(&vp->exec); +	vp->exec_start = 0; +	nvws->res_free(&vp->data); +	vp->data_start = 0; +	vp->data_start_min = 0; + +	vp->ir = vp->or = vp->clip_ctrl = 0; +	so_ref(NULL, &vp->so); +} + +struct nv40_state_entry nv40_state_vertprog = { +	.validate = nv40_vertprog_validate, +	.dirty = { +		.pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP, +		.hw = NV40_STATE_VERTPROG, +	} +}; + diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile new file mode 100644 index 0000000000..be30400c03 --- /dev/null +++ b/src/gallium/drivers/nv50/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv50 + +DRIVER_SOURCES = \ +	nv50_clear.c \ +	nv50_context.c \ +	nv50_draw.c \ +	nv50_miptree.c \ +	nv50_query.c \ +	nv50_program.c \ +	nv50_screen.c \ +	nv50_state.c \ +	nv50_state_validate.c \ +	nv50_surface.c \ +	nv50_tex.c \ +	nv50_vbo.c + +C_SOURCES = \ +	$(COMMON_SOURCES) \ +	$(DRIVER_SOURCES) + +ASM_SOURCES =  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c new file mode 100644 index 0000000000..f9bc3b53ca --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_clear.c @@ -0,0 +1,92 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +void +nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, +	   unsigned clearValue) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct pipe_framebuffer_state fb, s_fb = nv50->framebuffer; +	struct pipe_scissor_state sc, s_sc = nv50->scissor; +	unsigned dirty = nv50->dirty; + +	nv50->dirty = 0; + +	if (ps->format == PIPE_FORMAT_Z24S8_UNORM || +	    ps->format == PIPE_FORMAT_Z16_UNORM) { +		fb.nr_cbufs = 0; +		fb.zsbuf = ps; +	} else { +		fb.nr_cbufs = 1; +		fb.cbufs[0] = ps; +		fb.zsbuf = NULL; +	} +	fb.width = ps->width; +	fb.height = ps->height; +	pipe->set_framebuffer_state(pipe, &fb); + +	sc.minx = sc.miny = 0; +	sc.maxx = fb.width; +	sc.maxy = fb.height; +	pipe->set_scissor_state(pipe, &sc); + +	nv50_state_validate(nv50); + +	switch (ps->format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +		BEGIN_RING(chan, tesla, 0x0d80, 4); +		OUT_RINGf (chan, ubyte_to_float((clearValue >> 16) & 0xff)); +		OUT_RINGf (chan, ubyte_to_float((clearValue >>  8) & 0xff)); +		OUT_RINGf (chan, ubyte_to_float((clearValue >>  0) & 0xff)); +		OUT_RINGf (chan, ubyte_to_float((clearValue >> 24) & 0xff)); +		BEGIN_RING(chan, tesla, 0x19d0, 1); +		OUT_RING  (chan, 0x3c); +		break; +	case PIPE_FORMAT_Z24S8_UNORM: +		BEGIN_RING(chan, tesla, 0x0d90, 1); +		OUT_RINGf (chan, (float)(clearValue >> 8) * (1.0 / 16777215.0)); +		BEGIN_RING(chan, tesla, 0x0da0, 1); +		OUT_RING  (chan, clearValue & 0xff); +		BEGIN_RING(chan, tesla, 0x19d0, 1); +		OUT_RING  (chan, 0x03); +		break; +	default: +		pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, +				   clearValue); +		break; +	} + +	pipe->set_framebuffer_state(pipe, &s_fb); +	pipe->set_scissor_state(pipe, &s_sc); +	nv50->dirty |= dirty; + +	ps->status = PIPE_SURFACE_STATUS_CLEAR; +} + diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c new file mode 100644 index 0000000000..565a5da668 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.c @@ -0,0 +1,90 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +static void +nv50_flush(struct pipe_context *pipe, unsigned flags, +	   struct pipe_fence_handle **fence) +{ +	struct nv50_context *nv50 = (struct nv50_context *)pipe; +	 +	FIRE_RING(nv50->screen->nvws->channel); +} + +static void +nv50_destroy(struct pipe_context *pipe) +{ +	struct nv50_context *nv50 = (struct nv50_context *)pipe; + +	draw_destroy(nv50->draw); +	FREE(nv50); +} + + +static void +nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ +	struct pipe_winsys *pipe_winsys = pscreen->winsys; +	struct nv50_screen *screen = nv50_screen(pscreen); +	struct nv50_context *nv50; + +	nv50 = CALLOC_STRUCT(nv50_context); +	if (!nv50) +		return NULL; +	nv50->screen = screen; +	nv50->pctx_id = pctx_id; + +	nv50->pipe.winsys = pipe_winsys; +	nv50->pipe.screen = pscreen; + +	nv50->pipe.destroy = nv50_destroy; + +	nv50->pipe.set_edgeflags = nv50_set_edgeflags; +	nv50->pipe.draw_arrays = nv50_draw_arrays; +	nv50->pipe.draw_elements = nv50_draw_elements; +	nv50->pipe.clear = nv50_clear; + +	nv50->pipe.flush = nv50_flush; + +	nv50_init_surface_functions(nv50); +	nv50_init_state_functions(nv50); +	nv50_init_query_functions(nv50); + +	nv50->draw = draw_create(); +	assert(nv50->draw); +	draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50)); + +	return &nv50->pipe; +} + +		 diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h new file mode 100644 index 0000000000..1e9d45cb34 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -0,0 +1,207 @@ +#ifndef __NV50_CONTEXT_H__ +#define __NV50_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv50_screen.h" +#include "nv50_program.h" + +#define NOUVEAU_ERR(fmt, args...) \ +	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ +	fprintf(stderr, "nouveau: "fmt, ##args); + +/* Constant buffer assignment */ +#define NV50_CB_PMISC		0 +#define NV50_CB_PVP		1 +#define NV50_CB_PFP		2 +#define NV50_CB_PGP		3 +#define NV50_CB_TIC		4 +#define NV50_CB_TSC		5 +#define NV50_CB_PUPLOAD         6 + +#define NV50_NEW_BLEND		(1 << 0) +#define NV50_NEW_ZSA		(1 << 1) +#define NV50_NEW_BLEND_COLOUR	(1 << 2) +#define NV50_NEW_STIPPLE	(1 << 3) +#define NV50_NEW_SCISSOR	(1 << 4) +#define NV50_NEW_VIEWPORT	(1 << 5) +#define NV50_NEW_RASTERIZER	(1 << 6) +#define NV50_NEW_FRAMEBUFFER	(1 << 7) +#define NV50_NEW_VERTPROG	(1 << 8) +#define NV50_NEW_VERTPROG_CB	(1 << 9) +#define NV50_NEW_FRAGPROG	(1 << 10) +#define NV50_NEW_FRAGPROG_CB	(1 << 11) +#define NV50_NEW_ARRAYS		(1 << 12) +#define NV50_NEW_SAMPLER	(1 << 13) +#define NV50_NEW_TEXTURE	(1 << 14) + +struct nv50_blend_stateobj { +	struct pipe_blend_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv50_zsa_stateobj { +	struct pipe_depth_stencil_alpha_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv50_rasterizer_stateobj { +	struct pipe_rasterizer_state pipe; +	struct nouveau_stateobj *so; +}; + +struct nv50_miptree_level { +	struct pipe_buffer **image; +	int *image_offset; +	unsigned image_dirty_cpu[512/32]; +	unsigned image_dirty_gpu[512/32]; +}; + +struct nv50_miptree { +	struct pipe_texture base; +	struct pipe_buffer *buffer; + +	struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS]; +	int image_nr; +	int total_size; +}; + +static INLINE struct nv50_miptree * +nv50_miptree(struct pipe_texture *pt) +{ +	return (struct nv50_miptree *)pt; +} + +struct nv50_surface { +	struct pipe_surface base; +}; + +static INLINE struct nv50_surface * +nv50_surface(struct pipe_surface *pt) +{ +	return (struct nv50_surface *)pt; +} + +static INLINE struct pipe_buffer * +nv50_surface_buffer(struct pipe_surface *surface) +{ +	struct nv50_miptree *mt = (struct nv50_miptree *)surface->texture; +	return mt->buffer; +} + +struct nv50_state { +	unsigned dirty; + +	struct nouveau_stateobj *fb; +	struct nouveau_stateobj *blend; +	struct nouveau_stateobj *blend_colour; +	struct nouveau_stateobj *zsa; +	struct nouveau_stateobj *rast; +	struct nouveau_stateobj *stipple; +	struct nouveau_stateobj *scissor; +	unsigned scissor_enabled; +	struct nouveau_stateobj *viewport; +	unsigned viewport_bypass; +	struct nouveau_stateobj *tsc_upload; +	struct nouveau_stateobj *tic_upload; +	struct nouveau_stateobj *vertprog; +	struct nouveau_stateobj *fragprog; +	struct nouveau_stateobj *vtxfmt; +	struct nouveau_stateobj *vtxbuf; +}; + +struct nv50_context { +	struct pipe_context pipe; + +	struct nv50_screen *screen; +	unsigned pctx_id; + +	struct draw_context *draw; + +	struct nv50_state state; + +	unsigned dirty; +	struct nv50_blend_stateobj *blend; +	struct nv50_zsa_stateobj *zsa; +	struct nv50_rasterizer_stateobj *rasterizer; +	struct pipe_blend_color blend_colour; +	struct pipe_poly_stipple stipple; +	struct pipe_scissor_state scissor; +	struct pipe_viewport_state viewport; +	struct pipe_framebuffer_state framebuffer; +	struct nv50_program *vertprog; +	struct nv50_program *fragprog; +	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; +	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; +	unsigned vtxbuf_nr; +	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +	unsigned vtxelt_nr; +	unsigned *sampler[PIPE_MAX_SAMPLERS]; +	unsigned sampler_nr; +	struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS]; +	unsigned miptree_nr; +}; + +static INLINE struct nv50_context * +nv50_context(struct pipe_context *pipe) +{ +	
return (struct nv50_context *)pipe; +} + +extern void nv50_init_surface_functions(struct nv50_context *nv50); +extern void nv50_init_state_functions(struct nv50_context *nv50); +extern void nv50_init_query_functions(struct nv50_context *nv50); + +extern void nv50_screen_init_miptree_functions(struct pipe_screen *pscreen); + +extern int +nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst, +		     int dx, int dy, struct pipe_surface *src, int sx, int sy, +		     int w, int h); + +/* nv50_draw.c */ +extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50); + +/* nv50_vbo.c */ +extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode, +				unsigned start, unsigned count); +extern boolean nv50_draw_elements(struct pipe_context *pipe, +				  struct pipe_buffer *indexBuffer, +				  unsigned indexSize, +				  unsigned mode, unsigned start, +				  unsigned count); +extern void nv50_vbo_validate(struct nv50_context *nv50); + +/* nv50_clear.c */ +extern void nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, +		       unsigned clearValue); + +/* nv50_program.c */ +extern void nv50_vertprog_validate(struct nv50_context *nv50); +extern void nv50_fragprog_validate(struct nv50_context *nv50); +extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p); + +/* nv50_state_validate.c */ +extern boolean nv50_state_validate(struct nv50_context *nv50); + +/* nv50_tex.c */ +extern void nv50_tex_validate(struct nv50_context *); + +/* nv50_miptree.c */ +extern void nv50_miptree_sync(struct pipe_screen *, struct nv50_miptree *, +			      unsigned level, unsigned image); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_draw.c b/src/gallium/drivers/nv50/nv50_draw.c new file mode 100644 index 0000000000..2f6f607261 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_draw.c @@ -0,0 +1,89 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_pipe.h" + +#include "nv50_context.h" + +struct nv50_render_stage { +	struct draw_stage stage; +	struct nv50_context *nv50; +}; + +static INLINE struct nv50_render_stage * +nv50_render_stage(struct draw_stage *stage) +{ +	return (struct nv50_render_stage *)stage; +} + +static void +nv50_render_point(struct draw_stage *stage, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv50_render_line(struct draw_stage *stage, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv50_render_tri(struct draw_stage *stage, struct prim_header *prim) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv50_render_flush(struct draw_stage *stage, unsigned flags) +{ +} + +static void +nv50_render_reset_stipple_counter(struct draw_stage *stage) +{ +	NOUVEAU_ERR("\n"); +} + +static void +nv50_render_destroy(struct draw_stage *stage) +{ +	FREE(stage); +} + +struct draw_stage * +nv50_draw_render_stage(struct nv50_context *nv50) +{ +	struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage); + +	rs->nv50 = nv50; +	rs->stage.draw = nv50->draw; +	rs->stage.destroy = nv50_render_destroy; +	rs->stage.point = nv50_render_point; +	rs->stage.line = nv50_render_line; +	rs->stage.tri = nv50_render_tri; +	rs->stage.flush = nv50_render_flush; +	rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter; + +	return &rs->stage; +} + diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c new file mode 100644 index 0000000000..91091d53f5 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -0,0 +1,320 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +static struct pipe_texture * +nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) +{ +	struct pipe_winsys *ws = pscreen->winsys; +	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); +	struct pipe_texture *pt = &mt->base; +	unsigned usage, width = tmp->width[0], height = tmp->height[0]; +	unsigned depth = tmp->depth[0]; +	int i, l; + +	mt->base = *tmp; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; + +	usage = PIPE_BUFFER_USAGE_PIXEL; +	switch (pt->format) { +	case PIPE_FORMAT_Z24S8_UNORM: +	case PIPE_FORMAT_Z16_UNORM: +		usage |= NOUVEAU_BUFFER_USAGE_ZETA; +		break; +	default: +		break; +	} + +	switch (pt->target) { +	case PIPE_TEXTURE_3D: +		mt->image_nr = pt->depth[0]; +		break; +	case PIPE_TEXTURE_CUBE: +		mt->image_nr = 6; +		break; +	default: +		mt->image_nr = 1; +		break; +	} + +	for (l = 0; l <= pt->last_level; l++) { +		struct nv50_miptree_level *lvl = &mt->level[l]; + +		pt->width[l] = width; +		pt->height[l] = height; +		pt->depth[l] = depth; +		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); +		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + +		lvl->image_offset = CALLOC(mt->image_nr, sizeof(int)); +		lvl->image = CALLOC(mt->image_nr, sizeof(struct pipe_buffer *)); + +		width = MAX2(1, width >> 1); +		height = MAX2(1, height >> 1); +		depth = MAX2(1, depth >> 1); +	} + +	for (i = 0; i < mt->image_nr; i++) { +		for (l = 0; l <= pt->last_level; l++) { +			struct nv50_miptree_level *lvl = &mt->level[l]; +			int size; + +			size  = align(pt->width[l], 8) * pt->block.size; +			size  = align(size, 64); +			size *= align(pt->height[l], 8) * pt->block.size; + +			lvl->image[i] = ws->buffer_create(ws, 256, 0, size); +			lvl->image_offset[i] = mt->total_size; + +			mt->total_size += size; +		} +	} + +	mt->buffer = ws->buffer_create(ws, 256, usage, mt->total_size); +	if (!mt->buffer) { +		FREE(mt); +		return NULL; +	} + +	return &mt->base; +} + +static struct pipe_texture * +nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, +		     const unsigned *stride, struct pipe_buffer *pb) +{ +	struct nv50_miptree *mt; + +	/* Only supports 2D, non-mipmapped textures for the moment */ +	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || +	    pt->depth[0] != 1) +		return NULL; + +	mt = CALLOC_STRUCT(nv50_miptree); +	if (!mt) +		return NULL; + +	mt->base = *pt; +	mt->base.refcount = 1; +	mt->base.screen = pscreen; +	mt->image_nr = 1; +	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + +	pipe_buffer_reference(pscreen, &mt->buffer, pb); +	return &mt->base; +} + +static INLINE void +mark_dirty(uint32_t *flags, unsigned image) +{ +	flags[image / 32] |= (1 << (image % 32)); +} + +static INLINE void +mark_clean(uint32_t *flags, unsigned image) +{ +	flags[image / 32] &= ~(1 << (image % 32)); +} + +static INLINE int +is_dirty(uint32_t *flags, unsigned image) +{ +	return !!(flags[image / 32] & (1 << (image % 32))); +} + +static void +nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ +	struct pipe_texture *pt = *ppt; + +	*ppt = NULL; + +	if (--pt->refcount <= 0) { +		struct nv50_miptree *mt = nv50_miptree(pt); + +		pipe_buffer_reference(pscreen, &mt->buffer, NULL); +		FREE(mt); +	} +} + +void +nv50_miptree_sync(struct pipe_screen *pscreen, struct nv50_miptree *mt, +		  unsigned level, unsigned image) +{ +	struct nv50_screen *nvscreen = nv50_screen(pscreen); +	
struct nv50_miptree_level *lvl = &mt->level[level]; +	struct pipe_surface *dst, *src; +	unsigned face = 0, zslice = 0; + +	if (!is_dirty(lvl->image_dirty_cpu, image)) +		return; + +	if (mt->base.target == PIPE_TEXTURE_CUBE) +		face = image; +	else +	if (mt->base.target == PIPE_TEXTURE_3D) +		zslice = image; + +	/* Mark as clean already - so we don't continually call this function +	 * trying to get a GPU_WRITE pipe_surface! +	 */ +	mark_clean(lvl->image_dirty_cpu, image); + +	/* Pretend we're doing CPU access so we get the backing pipe_surface +	 * and not a view into the larger miptree. +	 */ +	src = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice, +				       PIPE_BUFFER_USAGE_CPU_READ); + +	/* Pretend we're only reading with the GPU so surface doesn't get marked +	 * as dirtied by the GPU. +	 */ +	dst = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice, +				       PIPE_BUFFER_USAGE_GPU_READ); + +	nv50_surface_do_copy(nvscreen, dst, 0, 0, src, 0, 0, dst->width, dst->height); + +	pscreen->tex_surface_release(pscreen, &dst); +	pscreen->tex_surface_release(pscreen, &src); +} + +/* The reverse of the above */ +static void +nv50_miptree_sync_cpu(struct pipe_screen *pscreen, struct nv50_miptree *mt, +		      unsigned level, unsigned image) +{ +	struct nv50_screen *nvscreen = nv50_screen(pscreen); +	struct nv50_miptree_level *lvl = &mt->level[level]; +	struct pipe_surface *dst, *src; +	unsigned face = 0, zslice = 0; + +	if (!is_dirty(lvl->image_dirty_gpu, image)) +		return; + +	if (mt->base.target == PIPE_TEXTURE_CUBE) +		face = image; +	else +	if (mt->base.target == PIPE_TEXTURE_3D) +		zslice = image; + +	mark_clean(lvl->image_dirty_gpu, image); + +	src = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice, +				       PIPE_BUFFER_USAGE_GPU_READ); +	dst = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice, +				       PIPE_BUFFER_USAGE_CPU_READ); + +	nv50_surface_do_copy(nvscreen, dst, 0, 0, src, 0, 0, dst->width, dst->height); + +	pscreen->tex_surface_release(pscreen, &dst); +	pscreen->tex_surface_release(pscreen, &src); +} + +static struct pipe_surface * +nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, +			 unsigned face, unsigned level, unsigned zslice, +			 unsigned flags) +{ +	struct nv50_miptree *mt = nv50_miptree(pt); +	struct nv50_miptree_level *lvl = &mt->level[level]; +	struct pipe_surface *ps; +	int img; + +	if (pt->target == PIPE_TEXTURE_CUBE) +		img = face; +	else +	if (pt->target == PIPE_TEXTURE_3D) +		img = zslice; +	else +		img = 0; + +	ps = CALLOC_STRUCT(pipe_surface); +	if (!ps) +		return NULL; +	pipe_texture_reference(&ps->texture, pt); +	ps->format = pt->format; +	ps->width = pt->width[level]; +	ps->height = pt->height[level]; +	ps->block = pt->block; +	ps->nblocksx = pt->nblocksx[level]; +	ps->nblocksy = pt->nblocksy[level]; +	ps->stride = ps->width * ps->block.size; +	ps->usage = flags; +	ps->status = PIPE_SURFACE_STATUS_DEFINED; +	ps->refcount = 1; +	ps->face = face; +	ps->level = level; +	ps->zslice = zslice; + +	if (flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) { +		assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE)); +		nv50_miptree_sync_cpu(pscreen, mt, level, img); + +		ps->offset = 0; +		pipe_texture_reference(&ps->texture, pt); + +		if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) +			mark_dirty(lvl->image_dirty_cpu, img); +	} else { +		nv50_miptree_sync(pscreen, mt, level, img); + +		ps->offset = lvl->image_offset[img]; +		pipe_texture_reference(&ps->texture, pt); + +		if (flags & 
PIPE_BUFFER_USAGE_GPU_WRITE) +			mark_dirty(lvl->image_dirty_gpu, img); +	} + +	return ps; +} + +static void +nv50_miptree_surface_del(struct pipe_screen *pscreen, +			 struct pipe_surface **psurface) +{ +	struct pipe_surface *ps = *psurface; +	struct nv50_surface *s = nv50_surface(ps); + +	*psurface = NULL; + +	if (--ps->refcount <= 0) { +		pipe_texture_reference(&ps->texture, NULL); +		FREE(s); +	} +} + +void +nv50_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ +	pscreen->texture_create = nv50_miptree_create; +	pscreen->texture_blanket = nv50_miptree_blanket; +	pscreen->texture_release = nv50_miptree_release; +	pscreen->get_tex_surface = nv50_miptree_surface_new; +	pscreen->tex_surface_release = nv50_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c new file mode 100644 index 0000000000..14c5d47e79 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -0,0 +1,1784 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv50_context.h" + +#define NV50_SU_MAX_TEMP 64 +//#define NV50_PROGRAM_DUMP + +/* ARL - gallium craps itself on progs/vp/arl.txt + * + * MSB - Like MAD, but MUL+SUB + * 	- Fuck it off, introduce a way to negate args for ops that + * 	  support it. + * + * Look into inlining IMMD for ops other than MOV (make it general?) + * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, + * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this + * + * In ops such as ADD it's possible to construct a bad opcode in the !is_long() + * case, if the emit_src() causes the inst to suddenly become long. + * + * Verify half-insns work where expected - and force disable them where they + * don't work - MUL has it forcibly disabled atm as it fixes POW.. + * + * FUCK! watch dst==src vectors, can overwrite components that are needed. + * 	ie. SUB R0, R0.yzxw, R0 + * + * Things to check with renouveau: + * 	FP attr/result assignment - how? 
+ * 		attrib + * 			- 0x16bc maps vp output onto fp hpos + * 			- 0x16c0 maps vp output onto fp col0 + * 		result + * 			- colr always 0-3 + * 			- depr always 4 + * 0x16bc->0x16e8 --> some binding between vp/fp regs + * 0x16b8 --> VP output count + * + * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 + * 	      "MOV rcol.x, fcol.y" = 0x00000004 + * 0x19a8 --> as above but 0x00000100 and 0x00000000 + * 	- 0x00100000 used when KIL used + * 0x196c --> as above but 0x00000011 and 0x00000000 + * + * 0x1988 --> 0xXXNNNNNN + * 	- XX == FP high something + */ +struct nv50_reg { +	enum { +		P_TEMP, +		P_ATTR, +		P_RESULT, +		P_CONST, +		P_IMMD +	} type; +	int index; + +	int hw; +	int neg; +}; + +struct nv50_pc { +	struct nv50_program *p; + +	/* hw resources */ +	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; + +	/* tgsi resources */ +	struct nv50_reg *temp; +	int temp_nr; +	struct nv50_reg *attr; +	int attr_nr; +	struct nv50_reg *result; +	int result_nr; +	struct nv50_reg *param; +	int param_nr; +	struct nv50_reg *immd; +	float *immd_buf; +	int immd_nr; + +	struct nv50_reg *temp_temp[16]; +	unsigned temp_temp_nr; +}; + +static void +alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) +{ +	int i; + +	if (reg->type == P_RESULT) { +		if (pc->p->cfg.high_result < (reg->hw + 1)) +			pc->p->cfg.high_result = reg->hw + 1; +	} + +	if (reg->type != P_TEMP) +		return; + +	if (reg->hw >= 0) { +		/*XXX: do this here too to catch FP temp-as-attr usage.. +		 *     not clean, but works */ +		if (pc->p->cfg.high_temp < (reg->hw + 1)) +			pc->p->cfg.high_temp = reg->hw + 1; +		return; +	} + +	for (i = 0; i < NV50_SU_MAX_TEMP; i++) { +		if (!(pc->r_temp[i])) { +			pc->r_temp[i] = reg; +			reg->hw = i; +			if (pc->p->cfg.high_temp < (i + 1)) +				pc->p->cfg.high_temp = i + 1; +			return; +		} +	} + +	assert(0); +} + +static struct nv50_reg * +alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) +{ +	struct nv50_reg *r; +	int i; + +	if (dst && dst->type == P_TEMP && dst->hw == -1) +		return dst; + +	for (i = 0; i < NV50_SU_MAX_TEMP; i++) { +		if (!pc->r_temp[i]) { +			r = CALLOC_STRUCT(nv50_reg); +			r->type = P_TEMP; +			r->index = -1; +			r->hw = i; +			pc->r_temp[i] = r; +			return r; +		} +	} + +	assert(0); +	return NULL; +} + +static void +free_temp(struct nv50_pc *pc, struct nv50_reg *r) +{ +	if (r->index == -1) { +		unsigned hw = r->hw; + +		FREE(pc->r_temp[hw]); +		pc->r_temp[hw] = NULL; +	} +} + +static int +alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) +{ +	int i; + +	if ((idx + 4) >= NV50_SU_MAX_TEMP) +		return 1; + +	if (pc->r_temp[idx] || pc->r_temp[idx + 1] || +	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) +		return alloc_temp4(pc, dst, idx + 1); + +	for (i = 0; i < 4; i++) { +		dst[i] = CALLOC_STRUCT(nv50_reg); +		dst[i]->type = P_TEMP; +		dst[i]->index = -1; +		dst[i]->hw = idx + i; +		pc->r_temp[idx + i] = dst[i]; +	} + +	return 0; +} + +static void +free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) +{ +	int i; + +	for (i = 0; i < 4; i++) +		free_temp(pc, reg[i]); +} + +static struct nv50_reg * +temp_temp(struct nv50_pc *pc) +{ +	if (pc->temp_temp_nr >= 16) +		assert(0); + +	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); +	return pc->temp_temp[pc->temp_temp_nr++]; +} + +static void +kill_temp_temp(struct nv50_pc *pc) +{ +	int i; +	 +	for (i = 0; i < pc->temp_temp_nr; i++) +		free_temp(pc, pc->temp_temp[i]); +	pc->temp_temp_nr = 0; +} + +static int +ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) +{ +	pc->immd_buf = 
REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
+			       (pc->immd_nr + 1) * 4 * sizeof(float));
+	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
+	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
+	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
+	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
+	
+	return pc->immd_nr++;
+}
+
+static struct nv50_reg *
+alloc_immd(struct nv50_pc *pc, float f)
+{
+	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	unsigned hw;
+
+	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
+	r->type = P_IMMD;
+	r->hw = hw;
+	r->index = -1;
+	return r;
+}
+
+static struct nv50_program_exec *
+exec(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
+
+	e->param.index = -1;
+	return e;
+}
+
+static void
+emit(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	struct nv50_program *p = pc->p;
+
+	if (p->exec_tail)
+		p->exec_tail->next = e;
+	if (!p->exec_head)
+		p->exec_head = e;
+	p->exec_tail = e;
+	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+}
+
+static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
+
+static boolean
+is_long(struct nv50_program_exec *e)
+{
+	if (e->inst[0] & 1)
+		return TRUE;
+	return FALSE;
+}
+
+static boolean
+is_immd(struct nv50_program_exec *e)
+{
+	if (is_long(e) && (e->inst[1] & 3) == 3)
+		return TRUE;
+	return FALSE;
+}
+
+static INLINE void
+set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
+	 struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
+	e->inst[1] |= (pred << 7) | (idx << 12);
+}
+
+static INLINE void
+set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
+	    struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
+	e->inst[1] |= (idx << 4) | (on << 6);
+}
+
+static INLINE void
+set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	if (is_long(e))
+		return;
+
+	e->inst[0] |= 1;
+	set_pred(pc, 0xf, 0, e);
+	set_pred_wr(pc, 0, 0, e);
+}
+
+static INLINE void
+set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
+{
+	if (dst->type == P_RESULT) {
+		set_long(pc, e);
+		e->inst[1] |= 0x00000008;
+	}
+
+	alloc_reg(pc, dst);
+	e->inst[0] |= (dst->hw << 2);
+}
+
+static INLINE void
+set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
+{
+	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+
+	set_long(pc, e);
+	/*XXX: can't be predicated - bits overlap.. catch cases where both
+	 *     are required and avoid them. 
*/ +	set_pred(pc, 0, 0, e); +	set_pred_wr(pc, 0, 0, e); + +	e->inst[1] |= 0x00000002 | 0x00000001; +	e->inst[0] |= (val & 0x3f) << 16; +	e->inst[1] |= (val >> 6) << 2; +} + +static void +emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, +	    struct nv50_reg *src, struct nv50_reg *iv) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0x80000000; +	set_dst(pc, dst, e); +	alloc_reg(pc, src); +	e->inst[0] |= (src->hw << 16); +	if (iv) { +		e->inst[0] |= (1 << 25); +		alloc_reg(pc, iv); +		e->inst[0] |= (iv->hw << 9); +	} + +	emit(pc, e); +} + +static void +set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, +	 struct nv50_program_exec *e) +{ +	set_long(pc, e); +#if 1 +	e->inst[1] |= (1 << 22); +#else +	if (src->type == P_IMMD) { +		e->inst[1] |= (NV50_CB_PMISC << 22); +	} else { +		if (pc->p->type == PIPE_SHADER_VERTEX) +			e->inst[1] |= (NV50_CB_PVP << 22); +		else +			e->inst[1] |= (NV50_CB_PFP << 22); +	} +#endif + +	e->param.index = src->hw; +	e->param.shift = s; +	e->param.mask = m << (s % 32); +} + +static void +emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0x10000000; + +	set_dst(pc, dst, e); + +	if (0 && dst->type != P_RESULT && src->type == P_IMMD) { +		set_immd(pc, src, e); +		/*XXX: 32-bit, but steals part of "half" reg space - need to +		 *     catch and handle this case if/when we do half-regs +		 */ +		e->inst[0] |= 0x00008000; +	} else +	if (src->type == P_IMMD || src->type == P_CONST) { +		set_long(pc, e); +		set_data(pc, src, 0x7f, 9, e); +		e->inst[1] |= 0x20000000; /* src0 const? */ +	} else { +		if (src->type == P_ATTR) { +			set_long(pc, e); +			e->inst[1] |= 0x00200000; +		} + +		alloc_reg(pc, src); +		e->inst[0] |= (src->hw << 9); +	} + +	/* We really should support "half" instructions here at some point, +	 * but I don't feel confident enough about them yet. 
+	 */ +	set_long(pc, e); +	if (is_long(e) && !is_immd(e)) { +		e->inst[1] |= 0x04000000; /* 32-bit */ +		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */ +	} + +	emit(pc, e); +} + +static boolean +check_swap_src_0_1(struct nv50_pc *pc, +		   struct nv50_reg **s0, struct nv50_reg **s1) +{ +	struct nv50_reg *src0 = *s0, *src1 = *s1; + +	if (src0->type == P_CONST) { +		if (src1->type != P_CONST) { +			*s0 = src1; +			*s1 = src0; +			return TRUE; +		} +	} else +	if (src1->type == P_ATTR) { +		if (src0->type != P_ATTR) { +			*s0 = src1; +			*s1 = src0; +			return TRUE; +		} +	} + +	return FALSE; +} + +static void +set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ +	if (src->type == P_ATTR) { +		set_long(pc, e); +		e->inst[1] |= 0x00200000; +	} else +	if (src->type == P_CONST || src->type == P_IMMD) { +		struct nv50_reg *temp = temp_temp(pc); + +		emit_mov(pc, temp, src); +		src = temp; +	} + +	alloc_reg(pc, src); +	e->inst[0] |= (src->hw << 9); +} + +static void +set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ +	if (src->type == P_ATTR) { +		struct nv50_reg *temp = temp_temp(pc); + +		emit_mov(pc, temp, src); +		src = temp; +	} else +	if (src->type == P_CONST || src->type == P_IMMD) { +		assert(!(e->inst[0] & 0x00800000)); +		if (e->inst[0] & 0x01000000) { +			struct nv50_reg *temp = temp_temp(pc); + +			emit_mov(pc, temp, src); +			src = temp; +		} else { +			set_data(pc, src, 0x7f, 16, e); +			e->inst[0] |= 0x00800000; +		} +	} + +	alloc_reg(pc, src); +	e->inst[0] |= (src->hw << 16); +} + +static void +set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ +	set_long(pc, e); + +	if (src->type == P_ATTR) { +		struct nv50_reg *temp = temp_temp(pc); + +		emit_mov(pc, temp, src); +		src = temp; +	} else +	if (src->type == P_CONST || src->type == P_IMMD) { +		assert(!(e->inst[0] & 0x01000000)); +		if (e->inst[0] & 0x00800000) { +			struct nv50_reg *temp = temp_temp(pc); + +			emit_mov(pc, temp, src); +			src = temp; +		} else { +			set_data(pc, src, 0x7f, 32+14, e); +			e->inst[0] |= 0x01000000; +		} +	} + +	alloc_reg(pc, src); +	e->inst[1] |= (src->hw << 14); +} + +static void +emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, +	 struct nv50_reg *src1) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xc0000000; +	set_long(pc, e); + +	check_swap_src_0_1(pc, &src0, &src1); +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_1(pc, src1, e); + +	emit(pc, e); +} + +static void +emit_add(struct nv50_pc *pc, struct nv50_reg *dst, +	 struct nv50_reg *src0, struct nv50_reg *src1) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xb0000000; + +	check_swap_src_0_1(pc, &src0, &src1); +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	if (is_long(e)) +		set_src_2(pc, src1, e); +	else +		set_src_1(pc, src1, e); + +	emit(pc, e); +} + +static void +emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, +	    struct nv50_reg *src0, struct nv50_reg *src1) +{ +	struct nv50_program_exec *e = exec(pc); + +	set_long(pc, e); +	e->inst[0] |= 0xb0000000; +	e->inst[1] |= (sub << 29); + +	check_swap_src_0_1(pc, &src0, &src1); +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_1(pc, src1, e); + +	emit(pc, e); +} + +static void +emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, +	 struct nv50_reg *src1) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xb0000000; + +	set_long(pc, e); +	if 
(check_swap_src_0_1(pc, &src0, &src1)) +		e->inst[1] |= 0x04000000; +	else +		e->inst[1] |= 0x08000000; + +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_2(pc, src1, e); + +	emit(pc, e); +} + +static void +emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, +	 struct nv50_reg *src1, struct nv50_reg *src2) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xe0000000; + +	check_swap_src_0_1(pc, &src0, &src1); +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_1(pc, src1, e); +	set_src_2(pc, src2, e); + +	emit(pc, e); +} + +static void +emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, +	 struct nv50_reg *src1, struct nv50_reg *src2) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xe0000000; +	set_long(pc, e); +	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */ + +	check_swap_src_0_1(pc, &src0, &src1); +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_1(pc, src1, e); +	set_src_2(pc, src2, e); + +	emit(pc, e); +} + +static void +emit_flop(struct nv50_pc *pc, unsigned sub, +	  struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0x90000000; +	if (sub) { +		set_long(pc, e); +		e->inst[1] |= (sub << 29); +	} + +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); + +	emit(pc, e); +} + +static void +emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xb0000000; + +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); +	set_long(pc, e); +	e->inst[1] |= (6 << 29) | 0x00004000; + +	emit(pc, e); +} + +static void +emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] |= 0xb0000000; + +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); +	set_long(pc, e); +	e->inst[1] |= (6 << 29); + +	emit(pc, e); +} + +static void +emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, +	 struct nv50_reg *src0, struct nv50_reg *src1) +{ +	struct nv50_program_exec *e = exec(pc); +	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; +	struct nv50_reg *rdst; + +	assert(c_op <= 7); +	if (check_swap_src_0_1(pc, &src0, &src1)) +		c_op = inv_cop[c_op]; + +	rdst = dst; +	if (dst->type != P_TEMP) +		dst = alloc_temp(pc, NULL); + +	/* set.u32 */ +	set_long(pc, e); +	e->inst[0] |= 0xb0000000; +	e->inst[1] |= (3 << 29); +	e->inst[1] |= (c_op << 14); +	/*XXX: breaks things, .u32 by default? +	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this +	 *     doesn't seem to match what the hw actually does. +	inst[1] |= 0x04000000; << breaks things.. .u32 by default? 
+	 */ +	set_dst(pc, dst, e); +	set_src_0(pc, src0, e); +	set_src_1(pc, src1, e); +	emit(pc, e); + +	/* cvt.f32.u32 */ +	e = exec(pc); +	e->inst[0] = 0xa0000001; +	e->inst[1] = 0x64014780; +	set_dst(pc, rdst, e); +	set_src_0(pc, dst, e); +	emit(pc, e); + +	if (dst != rdst) +		free_temp(pc, dst); +} + +static void +emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] = 0xa0000000; /* cvt */ +	set_long(pc, e); +	e->inst[1] |= (6 << 29); /* cvt */ +	e->inst[1] |= 0x08000000; /* integer mode */ +	e->inst[1] |= 0x04000000; /* 32 bit */ +	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */ +	e->inst[1] |= (1 << 14); /* src .f32 */ +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); + +	emit(pc, e); +} + +static void +emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, +	 struct nv50_reg *v, struct nv50_reg *e) +{ +	struct nv50_reg *temp = alloc_temp(pc, NULL); + +	emit_flop(pc, 3, temp, v); +	emit_mul(pc, temp, temp, e); +	emit_preex2(pc, temp, temp); +	emit_flop(pc, 6, dst, temp); + +	free_temp(pc, temp); +} + +static void +emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	e->inst[0] = 0xa0000000; /* cvt */ +	set_long(pc, e); +	e->inst[1] |= (6 << 29); /* cvt */ +	e->inst[1] |= 0x04000000; /* 32 bit */ +	e->inst[1] |= (1 << 14); /* src .f32 */ +	e->inst[1] |= ((1 << 6) << 14); /* .abs */ +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); + +	emit(pc, e); +} + +static void +emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, +	 struct nv50_reg **src) +{ +	struct nv50_reg *one = alloc_immd(pc, 1.0); +	struct nv50_reg *zero = alloc_immd(pc, 0.0); +	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); +	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999); +	struct nv50_reg *tmp[4]; + +	if (mask & (1 << 0)) +		emit_mov(pc, dst[0], one); + +	if (mask & (1 << 3)) +		emit_mov(pc, dst[3], one); + +	if (mask & (3 << 1)) { +		if (mask & (1 << 1)) +			tmp[0] = dst[1]; +		else +			tmp[0] = temp_temp(pc); +		emit_minmax(pc, 4, tmp[0], src[0], zero); +	} + +	if (mask & (1 << 2)) { +		set_pred_wr(pc, 1, 0, pc->p->exec_tail); + +		tmp[1] = temp_temp(pc); +		emit_minmax(pc, 4, tmp[1], src[1], zero); + +		tmp[3] = temp_temp(pc); +		emit_minmax(pc, 4, tmp[3], src[3], neg128); +		emit_minmax(pc, 5, tmp[3], tmp[3], pos128); + +		emit_pow(pc, dst[2], tmp[1], tmp[3]); +		emit_mov(pc, dst[2], zero); +		set_pred(pc, 3, 0, pc->p->exec_tail); +	} +} + +static void +emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ +	struct nv50_program_exec *e = exec(pc); + +	set_long(pc, e); +	e->inst[0] |= 0xa0000000; /* delta */ +	e->inst[1] |= (7 << 29); /* delta */ +	e->inst[1] |= 0x04000000; /* negate arg0? probably not */ +	e->inst[1] |= (1 << 14); /* src .f32 */ +	set_dst(pc, dst, e); +	set_src_0(pc, src, e); + +	emit(pc, e); +} + +static void +emit_kil(struct nv50_pc *pc, struct nv50_reg *src) +{ +	struct nv50_program_exec *e; +	const int r_pred = 1; + +	/* Sets predicate reg ? */ +	e = exec(pc); +	e->inst[0] = 0xa00001fd; +	e->inst[1] = 0xc4014788; +	set_src_0(pc, src, e); +	set_pred_wr(pc, 1, r_pred, e); +	emit(pc, e); + +	/* This is probably KILP */ +	e = exec(pc); +	e->inst[0] = 0x000001fe; +	set_long(pc, e); +	set_pred(pc, 1 /* LT? 
*/, r_pred, e); +	emit(pc, e); +} + +static struct nv50_reg * +tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) +{ +	switch (dst->DstRegister.File) { +	case TGSI_FILE_TEMPORARY: +		return &pc->temp[dst->DstRegister.Index * 4 + c]; +	case TGSI_FILE_OUTPUT: +		return &pc->result[dst->DstRegister.Index * 4 + c]; +	case TGSI_FILE_NULL: +		return NULL; +	default: +		break; +	} + +	return NULL; +} + +static struct nv50_reg * +tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src) +{ +	struct nv50_reg *r = NULL; +	struct nv50_reg *temp; +	unsigned c; + +	c = tgsi_util_get_full_src_register_extswizzle(src, chan); +	switch (c) { +	case TGSI_EXTSWIZZLE_X: +	case TGSI_EXTSWIZZLE_Y: +	case TGSI_EXTSWIZZLE_Z: +	case TGSI_EXTSWIZZLE_W: +		switch (src->SrcRegister.File) { +		case TGSI_FILE_INPUT: +			r = &pc->attr[src->SrcRegister.Index * 4 + c]; +			break; +		case TGSI_FILE_TEMPORARY: +			r = &pc->temp[src->SrcRegister.Index * 4 + c]; +			break; +		case TGSI_FILE_CONSTANT: +			r = &pc->param[src->SrcRegister.Index * 4 + c]; +			break; +		case TGSI_FILE_IMMEDIATE: +			r = &pc->immd[src->SrcRegister.Index * 4 + c]; +			break; +		case TGSI_FILE_SAMPLER: +			break; +		default: +			assert(0); +			break; +		} +		break; +	case TGSI_EXTSWIZZLE_ZERO: +		r = alloc_immd(pc, 0.0); +		break; +	case TGSI_EXTSWIZZLE_ONE: +		r = alloc_immd(pc, 1.0); +		break; +	default: +		assert(0); +		break; +	} + +	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { +	case TGSI_UTIL_SIGN_KEEP: +		break; +	case TGSI_UTIL_SIGN_CLEAR: +		temp = temp_temp(pc); +		emit_abs(pc, temp, r); +		r = temp; +		break; +	case TGSI_UTIL_SIGN_TOGGLE: +		temp = temp_temp(pc); +		emit_neg(pc, temp, r); +		r = temp; +		break; +	case TGSI_UTIL_SIGN_SET: +		temp = temp_temp(pc); +		emit_abs(pc, temp, r); +		emit_neg(pc, temp, r); +		r = temp; +		break; +	default: +		assert(0); +		break; +	} + +	return r; +} + +static boolean +nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) +{ +	const struct tgsi_full_instruction *inst = &tok->FullInstruction; +	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; +	unsigned mask, sat, unit; +	int i, c; + +	mask = inst->FullDstRegisters[0].DstRegister.WriteMask; +	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; + +	for (c = 0; c < 4; c++) { +		if (mask & (1 << c)) +			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); +		else +			dst[c] = NULL; +	} + +	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { +		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + +		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) +			unit = fs->SrcRegister.Index; + +		for (c = 0; c < 4; c++) +			src[i][c] = tgsi_src(pc, c, fs); +	} + +	if (sat) { +		for (c = 0; c < 4; c++) { +			rdst[c] = dst[c]; +			dst[c] = temp_temp(pc); +		} +	} + +	switch (inst->Instruction.Opcode) { +	case TGSI_OPCODE_ABS: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_abs(pc, dst[c], src[0][c]); +		} +		break; +	case TGSI_OPCODE_ADD: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_add(pc, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_COS: +		temp = alloc_temp(pc, NULL); +		emit_precossin(pc, temp, src[0][0]); +		emit_flop(pc, 5, temp, temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		break; +	case TGSI_OPCODE_DP3: +		temp = alloc_temp(pc, NULL); +		emit_mul(pc, temp, src[0][0], src[1][0]); +		
emit_mad(pc, temp, src[0][1], src[1][1], temp); +		emit_mad(pc, temp, src[0][2], src[1][2], temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_DP4: +		temp = alloc_temp(pc, NULL); +		emit_mul(pc, temp, src[0][0], src[1][0]); +		emit_mad(pc, temp, src[0][1], src[1][1], temp); +		emit_mad(pc, temp, src[0][2], src[1][2], temp); +		emit_mad(pc, temp, src[0][3], src[1][3], temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_DPH: +		temp = alloc_temp(pc, NULL); +		emit_mul(pc, temp, src[0][0], src[1][0]); +		emit_mad(pc, temp, src[0][1], src[1][1], temp); +		emit_mad(pc, temp, src[0][2], src[1][2], temp); +		emit_add(pc, temp, src[1][3], temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_DST: +	{ +		struct nv50_reg *one = alloc_immd(pc, 1.0); +		if (mask & (1 << 0)) +			emit_mov(pc, dst[0], one); +		if (mask & (1 << 1)) +			emit_mul(pc, dst[1], src[0][1], src[1][1]); +		if (mask & (1 << 2)) +			emit_mov(pc, dst[2], src[0][2]); +		if (mask & (1 << 3)) +			emit_mov(pc, dst[3], src[1][3]); +		FREE(one); +	} +		break; +	case TGSI_OPCODE_EX2: +		temp = alloc_temp(pc, NULL); +		emit_preex2(pc, temp, src[0][0]); +		emit_flop(pc, 6, temp, temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_FLR: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_flr(pc, dst[c], src[0][c]); +		} +		break; +	case TGSI_OPCODE_FRC: +		temp = alloc_temp(pc, NULL); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_flr(pc, temp, src[0][c]); +			emit_sub(pc, dst[c], src[0][c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_KIL: +		emit_kil(pc, src[0][0]); +		emit_kil(pc, src[0][1]); +		emit_kil(pc, src[0][2]); +		emit_kil(pc, src[0][3]); +		break; +	case TGSI_OPCODE_LIT: +		emit_lit(pc, &dst[0], mask, &src[0][0]); +		break; +	case TGSI_OPCODE_LG2: +		temp = alloc_temp(pc, NULL); +		emit_flop(pc, 3, temp, src[0][0]); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		break; +	case TGSI_OPCODE_LRP: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			/*XXX: we can do better than this */ +			temp = alloc_temp(pc, NULL); +			emit_neg(pc, temp, src[0][c]); +			emit_mad(pc, temp, temp, src[2][c], src[2][c]); +			emit_mad(pc, dst[c], src[0][c], src[1][c], temp); +			free_temp(pc, temp); +		} +		break; +	case TGSI_OPCODE_MAD: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); +		} +		break; +	case TGSI_OPCODE_MAX: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_MIN: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_MOV: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], src[0][c]); +		} +		break; +	case TGSI_OPCODE_MUL: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << 
c))) +				continue; +			emit_mul(pc, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_POW: +		temp = alloc_temp(pc, NULL); +		emit_pow(pc, temp, src[0][0], src[1][0]); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_RCP: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_flop(pc, 0, dst[c], src[0][0]); +		} +		break; +	case TGSI_OPCODE_RSQ: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_flop(pc, 2, dst[c], src[0][0]); +		} +		break; +	case TGSI_OPCODE_SCS: +		temp = alloc_temp(pc, NULL); +		emit_precossin(pc, temp, src[0][0]); +		if (mask & (1 << 0)) +			emit_flop(pc, 5, dst[0], temp); +		if (mask & (1 << 1)) +			emit_flop(pc, 4, dst[1], temp); +		break; +	case TGSI_OPCODE_SGE: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_set(pc, 6, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_SIN: +		temp = alloc_temp(pc, NULL); +		emit_precossin(pc, temp, src[0][0]); +		emit_flop(pc, 4, temp, temp); +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_mov(pc, dst[c], temp); +		} +		break; +	case TGSI_OPCODE_SLT: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_set(pc, 1, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_SUB: +		for (c = 0; c < 4; c++) { +			if (!(mask & (1 << c))) +				continue; +			emit_sub(pc, dst[c], src[0][c], src[1][c]); +		} +		break; +	case TGSI_OPCODE_TEX: +	case TGSI_OPCODE_TXP: +	{ +		struct nv50_reg *t[4]; +		struct nv50_program_exec *e; + +		alloc_temp4(pc, t, 0); +		emit_mov(pc, t[0], src[0][0]); +		emit_mov(pc, t[1], src[0][1]); + +		e = exec(pc); +		e->inst[0] = 0xf6400000; +		e->inst[0] |= (unit << 9); +		set_long(pc, e); +		e->inst[1] |= 0x0000c004; +		set_dst(pc, t[0], e); +		emit(pc, e); + +		if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]); +		if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]); +		if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]); +		if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]); + +		free_temp4(pc, t); +	} +		break; +	case TGSI_OPCODE_XPD: +		temp = alloc_temp(pc, NULL); +		if (mask & (1 << 0)) { +			emit_mul(pc, temp, src[0][2], src[1][1]); +			emit_msb(pc, dst[0], src[0][1], src[1][2], temp); +		} +		if (mask & (1 << 1)) { +			emit_mul(pc, temp, src[0][0], src[1][2]); +			emit_msb(pc, dst[1], src[0][2], src[1][0], temp); +		} +		if (mask & (1 << 2)) { +			emit_mul(pc, temp, src[0][1], src[1][0]); +			emit_msb(pc, dst[2], src[0][0], src[1][1], temp); +		} +		free_temp(pc, temp); +		break; +	case TGSI_OPCODE_END: +		break; +	default: +		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); +		return FALSE; +	} + +	if (sat) { +		for (c = 0; c < 4; c++) { +			struct nv50_program_exec *e; + +			if (!(mask & (1 << c))) +				continue; +			e = exec(pc); + +			e->inst[0] = 0xa0000000; /* cvt */ +			set_long(pc, e); +			e->inst[1] |= (6 << 29); /* cvt */ +			e->inst[1] |= 0x04000000; /* 32 bit */ +			e->inst[1] |= (1 << 14); /* src .f32 */ +			e->inst[1] |= ((1 << 5) << 14); /* .sat */ +			set_dst(pc, rdst[c], e); +			set_src_0(pc, dst[c], e); +			emit(pc, e); +		} +	} + +	kill_temp_temp(pc); +	return TRUE; +} + +static boolean +nv50_program_tx_prep(struct nv50_pc *pc) +{ +	struct tgsi_parse_context p; +	boolean ret = FALSE; +	unsigned i, c; + +	tgsi_parse_init(&p, pc->p->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&p)) { +		const 
union tgsi_full_token *tok = &p.FullToken; + +		tgsi_parse_token(&p); +		switch (tok->Token.Type) { +		case TGSI_TOKEN_TYPE_IMMEDIATE: +		{ +			const struct tgsi_full_immediate *imm = +				&p.FullToken.FullImmediate; + +			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float, +				      imm->u.ImmediateFloat32[1].Float, +				      imm->u.ImmediateFloat32[2].Float, +				      imm->u.ImmediateFloat32[3].Float); +		} +			break; +		case TGSI_TOKEN_TYPE_DECLARATION: +		{ +			const struct tgsi_full_declaration *d; +			unsigned last; + +			d = &p.FullToken.FullDeclaration; +			last = d->DeclarationRange.Last; + +			switch (d->Declaration.File) { +			case TGSI_FILE_TEMPORARY: +				if (pc->temp_nr < (last + 1)) +					pc->temp_nr = last + 1; +				break; +			case TGSI_FILE_OUTPUT: +				if (pc->result_nr < (last + 1)) +					pc->result_nr = last + 1; +				break; +			case TGSI_FILE_INPUT: +				if (pc->attr_nr < (last + 1)) +					pc->attr_nr = last + 1; +				break; +			case TGSI_FILE_CONSTANT: +				if (pc->param_nr < (last + 1)) +					pc->param_nr = last + 1; +				break; +			case TGSI_FILE_SAMPLER: +				break; +			default: +				NOUVEAU_ERR("bad decl file %d\n", +					    d->Declaration.File); +				goto out_err; +			} +		} +			break; +		case TGSI_TOKEN_TYPE_INSTRUCTION: +			break; +		default: +			break; +		} +	} + +	if (pc->temp_nr) { +		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); +		if (!pc->temp) +			goto out_err; + +		for (i = 0; i < pc->temp_nr; i++) { +			for (c = 0; c < 4; c++) { +				pc->temp[i*4+c].type = P_TEMP; +				pc->temp[i*4+c].hw = -1; +				pc->temp[i*4+c].index = i; +			} +		} +	} + +	if (pc->attr_nr) { +		struct nv50_reg *iv = NULL; +		int aid = 0; + +		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); +		if (!pc->attr) +			goto out_err; + +		if (pc->p->type == PIPE_SHADER_FRAGMENT) { +			iv = alloc_temp(pc, NULL); +			emit_interp(pc, iv, iv, NULL); +			emit_flop(pc, 0, iv, iv); +			aid++; +		} + +		for (i = 0; i < pc->attr_nr; i++) { +			struct nv50_reg *a = &pc->attr[i*4]; + +			for (c = 0; c < 4; c++) { +				if (pc->p->type == PIPE_SHADER_FRAGMENT) { +					struct nv50_reg *at = +						alloc_temp(pc, NULL); +					pc->attr[i*4+c].type = at->type; +					pc->attr[i*4+c].hw = at->hw; +					pc->attr[i*4+c].index = at->index; +				} else { +					pc->p->cfg.vp.attr[aid/32] |= +						(1 << (aid % 32)); +					pc->attr[i*4+c].type = P_ATTR; +					pc->attr[i*4+c].hw = aid++; +					pc->attr[i*4+c].index = i; +				} +			} + +			if (pc->p->type != PIPE_SHADER_FRAGMENT) +				continue; + +			emit_interp(pc, &a[0], &a[0], iv); +			emit_interp(pc, &a[1], &a[1], iv); +			emit_interp(pc, &a[2], &a[2], iv); +			emit_interp(pc, &a[3], &a[3], iv); +		} + +		if (iv) +			free_temp(pc, iv); +	} + +	if (pc->result_nr) { +		int rid = 0; + +		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); +		if (!pc->result) +			goto out_err; + +		for (i = 0; i < pc->result_nr; i++) { +			for (c = 0; c < 4; c++) { +				if (pc->p->type == PIPE_SHADER_FRAGMENT) { +					pc->result[i*4+c].type = P_TEMP; +					pc->result[i*4+c].hw = -1; +				} else { +					pc->result[i*4+c].type = P_RESULT; +					pc->result[i*4+c].hw = rid++; +				} +				pc->result[i*4+c].index = i; +			} +		} +	} + +	if (pc->param_nr) { +		int rid = 0; + +		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); +		if (!pc->param) +			goto out_err; + +		for (i = 0; i < pc->param_nr; i++) { +			for (c = 0; c < 4; c++) { +				pc->param[i*4+c].type = P_CONST; +				pc->param[i*4+c].hw = rid++; +				pc->param[i*4+c].index = i; 
+			} +		} +	} + +	if (pc->immd_nr) { +		int rid = pc->param_nr * 4; + +		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); +		if (!pc->immd) +			goto out_err; + +		for (i = 0; i < pc->immd_nr; i++) { +			for (c = 0; c < 4; c++) { +				pc->immd[i*4+c].type = P_IMMD; +				pc->immd[i*4+c].hw = rid++; +				pc->immd[i*4+c].index = i; +			} +		} +	} + +	ret = TRUE; +out_err: +	tgsi_parse_free(&p); +	return ret; +} + +static boolean +nv50_program_tx(struct nv50_program *p) +{ +	struct tgsi_parse_context parse; +	struct nv50_pc *pc; +	boolean ret; + +	pc = CALLOC_STRUCT(nv50_pc); +	if (!pc) +		return FALSE; +	pc->p = p; +	pc->p->cfg.high_temp = 4; + +	ret = nv50_program_tx_prep(pc); +	if (ret == FALSE) +		goto out_cleanup; + +	tgsi_parse_init(&parse, pc->p->pipe.tokens); +	while (!tgsi_parse_end_of_tokens(&parse)) { +		const union tgsi_full_token *tok = &parse.FullToken; + +		tgsi_parse_token(&parse); + +		switch (tok->Token.Type) { +		case TGSI_TOKEN_TYPE_INSTRUCTION: +			ret = nv50_program_tx_insn(pc, tok); +			if (ret == FALSE) +				goto out_err; +			break; +		default: +			break; +		} +	} + +	if (p->type == PIPE_SHADER_FRAGMENT) { +		struct nv50_reg out; + +		out.type = P_TEMP; +		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++) +			emit_mov(pc, &out, &pc->result[out.hw]); +	} + +	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); +	pc->p->exec_tail->inst[1] |= 0x00000001; + +	p->param_nr = pc->param_nr * 4; +	p->immd_nr = pc->immd_nr * 4; +	p->immd = pc->immd_buf; + +out_err: +	tgsi_parse_free(&parse); + +out_cleanup: +	return ret; +} + +static void +nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) +{ +	if (nv50_program_tx(p) == FALSE) +		assert(0); +	p->translated = TRUE; +} + +static void +nv50_program_upload_data(struct nv50_context *nv50, float *map, +			 unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; + +	while (count) { +		unsigned nr = count > 2047 ? 
2047 : count; + +		BEGIN_RING(chan, tesla, 0x00000f00, 1); +		OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8)); +		BEGIN_RING(chan, tesla, 0x40000f04, nr); +		OUT_RINGp (chan, map, nr); + +		map += nr; +		start += nr; +		count -= nr; +	} +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ +	struct nouveau_winsys *nvws = nv50->screen->nvws; +	struct pipe_winsys *ws = nv50->pipe.winsys; +	unsigned nr = p->param_nr + p->immd_nr; + +	if (!p->data && nr) { +		struct nouveau_resource *heap = nv50->screen->vp_data_heap; + +		if (nvws->res_alloc(heap, nr, p, &p->data)) { +			while (heap->next && heap->size < nr) { +				struct nv50_program *evict = heap->next->priv; +				nvws->res_free(&evict->data); +			} + +			if (nvws->res_alloc(heap, nr, p, &p->data)) +				assert(0); +		} +	} + +	if (p->param_nr) { +		float *map = ws->buffer_map(ws, nv50->constbuf[p->type], +					    PIPE_BUFFER_USAGE_CPU_READ); +		nv50_program_upload_data(nv50, map, p->data->start, +					 p->param_nr); +		ws->buffer_unmap(ws, nv50->constbuf[p->type]); +	} + +	if (p->immd_nr) { +		nv50_program_upload_data(nv50, p->immd, +					 p->data->start + p->param_nr, +					 p->immd_nr); +	} +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct pipe_winsys *ws = nv50->pipe.winsys; +	struct nv50_program_exec *e; +	struct nouveau_stateobj *so; +	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; +	unsigned start, count, *up, *ptr; +	boolean upload = FALSE; + +	if (!p->buffer) { +		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4); +		upload = TRUE; +	} + +	if (p->data && p->data->start != p->data_start) { +		for (e = p->exec_head; e; e = e->next) { +			unsigned ei, ci; + +			if (e->param.index < 0) +				continue; +			ei = e->param.shift >> 5; +			ci = e->param.index + p->data->start; + +			e->inst[ei] &= ~e->param.mask; +			e->inst[ei] |= (ci << e->param.shift); +		} + +		p->data_start = p->data->start; +		upload = TRUE; +	} + +	if (!upload) +		return; + +#ifdef NV50_PROGRAM_DUMP +	NOUVEAU_ERR("-------\n"); +	up = ptr = MALLOC(p->exec_size * 4); +	for (e = p->exec_head; e; e = e->next) { +		NOUVEAU_ERR("0x%08x\n", e->inst[0]); +		if (is_long(e)) +			NOUVEAU_ERR("0x%08x\n", e->inst[1]); +	} + +#endif + +	up = ptr = MALLOC(p->exec_size * 4); +	for (e = p->exec_head; e; e = e->next) { +		*(ptr++) = e->inst[0]; +		if (is_long(e)) +			*(ptr++) = e->inst[1]; +	} + +	so = so_new(4,2); +	so_method(so, nv50->screen->tesla, 0x1280, 3); +	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); + +	start = 0; count = p->exec_size; +	while (count) { +		struct nouveau_winsys *nvws = nv50->screen->nvws; +		unsigned nr; + +		so_emit(nvws, so); + +		nr = MIN2(count, 2047); +		nr = MIN2(nvws->channel->pushbuf->remaining, nr); +		if (nvws->channel->pushbuf->remaining < (nr + 3)) { +			FIRE_RING(chan); +			continue; +		} + +		BEGIN_RING(chan, tesla, 0x0f00, 1); +		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD); +		BEGIN_RING(chan, tesla, 0x40000f04, nr);	 +		OUT_RINGp (chan, up + start, nr); + +		start += nr; +		count -= nr; +	} + +	FREE(up); +	so_ref(NULL, &so); +} + +void +nv50_vertprog_validate(struct nv50_context *nv50) +{ +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct 
nv50_program *p = nv50->vertprog; +	struct nouveau_stateobj *so; + +	if (!p->translated) { +		nv50_program_validate(nv50, p); +		if (!p->translated) +			assert(0); +	} + +	nv50_program_validate_data(nv50, p); +	nv50_program_validate_code(nv50, p); + +	so = so_new(13, 2); +	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); +	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +		  NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +		  NOUVEAU_BO_LOW, 0, 0); +	so_method(so, tesla, 0x1650, 2); +	so_data  (so, p->cfg.vp.attr[0]); +	so_data  (so, p->cfg.vp.attr[1]); +	so_method(so, tesla, 0x16b8, 1); +	so_data  (so, p->cfg.high_result); +	so_method(so, tesla, 0x16ac, 2); +	so_data  (so, p->cfg.high_result); //8); +	so_data  (so, p->cfg.high_temp); +	so_method(so, tesla, 0x140c, 1); +	so_data  (so, 0); /* program start offset */ +	so_ref(so, &nv50->state.vertprog); +} + +void +nv50_fragprog_validate(struct nv50_context *nv50) +{ +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nv50_program *p = nv50->fragprog; +	struct nouveau_stateobj *so; + +	if (!p->translated) { +		nv50_program_validate(nv50, p); +		if (!p->translated) +			assert(0); +	} + +	nv50_program_validate_data(nv50, p); +	nv50_program_validate_code(nv50, p); + +	so = so_new(64, 2); +	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); +	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +		  NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +		  NOUVEAU_BO_LOW, 0, 0); +	so_method(so, tesla, 0x1904, 4); +	so_data  (so, 0x00040404); /* p: 0x01000404 */ +	so_data  (so, 0x00000004); +	so_data  (so, 0x00000000); +	so_data  (so, 0x00000000); +	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */ +	so_data  (so, 0x03020100); +	so_data  (so, 0x07060504); +	so_data  (so, 0x0b0a0908); +	so_method(so, tesla, 0x1988, 2); +	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */ +	so_data  (so, p->cfg.high_temp); +	so_method(so, tesla, 0x1414, 1); +	so_data  (so, 0); /* program start offset */ +	so_ref(so, &nv50->state.fragprog); +} + +void +nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) +{ +	struct pipe_screen *pscreen = nv50->pipe.screen; + +	while (p->exec_head) { +		struct nv50_program_exec *e = p->exec_head; + +		p->exec_head = e->next; +		FREE(e); +	} +	p->exec_tail = NULL; +	p->exec_size = 0; + +	if (p->buffer) +		pipe_buffer_reference(pscreen, &p->buffer, NULL); + +	nv50->screen->nvws->res_free(&p->data); + +	p->translated = 0; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h new file mode 100644 index 0000000000..78deed6a38 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -0,0 +1,45 @@ +#ifndef __NV50_PROGRAM_H__ +#define __NV50_PROGRAM_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv50_program_exec { +	struct nv50_program_exec *next; + +	unsigned inst[2]; +	struct { +		int index; +		unsigned mask; +		unsigned shift; +	} param; +}; + +struct nv50_program { +	struct pipe_shader_state pipe; +	struct tgsi_shader_info info; +	boolean translated; + +	unsigned type; +	struct nv50_program_exec *exec_head; +	struct nv50_program_exec *exec_tail; +	unsigned exec_size; +	struct nouveau_resource *data; +	unsigned data_start; + +	struct pipe_buffer *buffer; + +	float *immd; +	unsigned immd_nr; +	unsigned param_nr; + +	struct { +		unsigned high_temp; +		unsigned high_result; +		struct { +			unsigned attr[2]; +		} vp; +	} cfg; +}; 
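+
+/* Rough notes on how the above is consumed by nv50_program.c:
+ *
+ * - Each nv50_program_exec holds one opcode; bit 0 of inst[0] marks the
+ *   "long" two-word encoding, which is why emit() grows exec_size by
+ *   either 1 or 2.
+ *
+ * - param records where an index into the program's constant/immediate
+ *   data block was baked into the opcode.  When that block is (re)placed
+ *   in the heap, nv50_program_validate_code() patches the affected
+ *   instruction word (param.shift selects word and bit position,
+ *   param.mask the field) before re-uploading the flattened code.
+ */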
+ +#endif diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c new file mode 100644 index 0000000000..20745ceab8 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_query.c @@ -0,0 +1,134 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +struct nv50_query { +	struct pipe_buffer *buffer; +	unsigned type; +	boolean ready; +	uint64_t result; +}; + +static INLINE struct nv50_query * +nv50_query(struct pipe_query *pipe) +{ +	return (struct nv50_query *)pipe; +} + +static struct pipe_query * +nv50_query_create(struct pipe_context *pipe, unsigned type) +{ +	struct pipe_winsys *ws = pipe->winsys; +	struct nv50_query *q = CALLOC_STRUCT(nv50_query); + +	assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER); +	q->type = type; + +	q->buffer = ws->buffer_create(ws, 256, 0, 16); +	if (!q->buffer) { +		FREE(q); +		return NULL; +	} + +	return (struct pipe_query *)q; +} + +static void +nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv50_query *q = nv50_query(pq); + +	if (q) { +		pipe_buffer_reference(pipe->screen, &q->buffer, NULL); +		FREE(q); +	} +} + +static void +nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nv50_query *q = nv50_query(pq); + +	BEGIN_RING(chan, tesla, 0x1530, 1); +	OUT_RING  (chan, 1); +	BEGIN_RING(chan, tesla, 0x1514, 1); +	OUT_RING  (chan, 1); + +	q->ready = FALSE; +} + +static void +nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nv50_query *q = nv50_query(pq); + +	WAIT_RING (chan, 5); +	BEGIN_RING(chan, tesla, 0x1b00, 4); +	OUT_RELOCh(chan, q->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	OUT_RELOCl(chan, q->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +	OUT_RING  (chan, 0x00000000); +	OUT_RING  (chan, 0x0100f002); +	FIRE_RING (chan); +} + +static boolean +nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, +		  boolean wait, uint64_t *result) +{ +	struct pipe_winsys *ws = pipe->winsys; +	struct nv50_query *q = nv50_query(pq); + +	/*XXX: Want to be able to return FALSE here instead 
of blocking +	 *     until the result is available.. +	 */ + +	if (!q->ready) { +		uint32_t *map = ws->buffer_map(ws, q->buffer, +					       PIPE_BUFFER_USAGE_CPU_READ); +		q->result = map[1]; +		q->ready = TRUE; +		ws->buffer_unmap(ws, q->buffer); +	} + +	*result = q->result; +	return q->ready; +} + +void +nv50_init_query_functions(struct nv50_context *nv50) +{ +	nv50->pipe.create_query = nv50_query_create; +	nv50->pipe.destroy_query = nv50_query_destroy; +	nv50->pipe.begin_query = nv50_query_begin; +	nv50->pipe.end_query = nv50_query_end; +	nv50->pipe.get_query_result = nv50_query_result; +} diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c new file mode 100644 index 0000000000..58d7a621a8 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -0,0 +1,356 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_screen.h" + +#include "util/u_simple_screen.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +#include "nouveau/nouveau_stateobj.h" + +#define NV5X_GRCLASS5097_CHIPSETS 0x00000001 +#define NV8X_GRCLASS8297_CHIPSETS 0x00000050 +#define NV9X_GRCLASS8297_CHIPSETS 0x00000014 + +static boolean +nv50_screen_is_format_supported(struct pipe_screen *pscreen, +				enum pipe_format format, +				enum pipe_texture_target target, +				unsigned tex_usage, unsigned geom_flags) +{ +	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM: +		case PIPE_FORMAT_Z24S8_UNORM: +		case PIPE_FORMAT_Z16_UNORM: +			return TRUE; +		default: +			break; +		} +	} else { +		switch (format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +		case PIPE_FORMAT_A1R5G5B5_UNORM: +		case PIPE_FORMAT_A4R4G4B4_UNORM: +		case PIPE_FORMAT_R5G6B5_UNORM: +		case PIPE_FORMAT_L8_UNORM: +		case PIPE_FORMAT_A8_UNORM: +		case PIPE_FORMAT_I8_UNORM: +		case PIPE_FORMAT_A8L8_UNORM: +		case PIPE_FORMAT_DXT1_RGB: +		case PIPE_FORMAT_DXT1_RGBA: +		case PIPE_FORMAT_DXT3_RGBA: +		case PIPE_FORMAT_DXT5_RGBA: +			return TRUE; +		default: +			break; +		} +	} + +	return FALSE; +} + +static const char * +nv50_screen_get_name(struct pipe_screen *pscreen) +{ +	struct nv50_screen *screen = nv50_screen(pscreen); +	struct nouveau_device *dev = screen->nvws->channel->device; +	static char buffer[128]; + +	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); +	return buffer; +} + +static const char * +nv50_screen_get_vendor(struct pipe_screen *pscreen) +{ +	return "nouveau"; +} + +static int +nv50_screen_get_param(struct pipe_screen *pscreen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +		return 32; +	case PIPE_CAP_NPOT_TEXTURES: +		return 1; +	case PIPE_CAP_TWO_SIDED_STENCIL: +		return 1; +	case PIPE_CAP_GLSL: +		return 0; +	case PIPE_CAP_S3TC: +		return 1; +	case PIPE_CAP_ANISOTROPIC_FILTER: +		return 1; +	case PIPE_CAP_POINT_SPRITE: +		return 0; +	case PIPE_CAP_MAX_RENDER_TARGETS: +		return 8; +	case PIPE_CAP_OCCLUSION_QUERY: +		return 1; +	case PIPE_CAP_TEXTURE_SHADOW_MAP: +		return 1; +	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +		return 13; +	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +		return 10; +	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +		return 13; +	case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +	case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +		return 1; +	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +		return 0; +	case NOUVEAU_CAP_HW_VTXBUF:	 +		return 1; +	case NOUVEAU_CAP_HW_IDXBUF:	 +		return 0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0; +	} +} + +static float +nv50_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ +	switch (param) { +	case PIPE_CAP_MAX_LINE_WIDTH: +	case PIPE_CAP_MAX_LINE_WIDTH_AA: +		return 10.0; +	case PIPE_CAP_MAX_POINT_WIDTH: +	case PIPE_CAP_MAX_POINT_WIDTH_AA: +		return 64.0; +	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +		return 16.0; +	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +		return 4.0; +	default: +		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); +		return 0.0; +	} +} + +static void +nv50_screen_destroy(struct pipe_screen *pscreen) +{ +	FREE(pscreen); +} + +struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ +	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen); +	struct nouveau_stateobj *so; +	unsigned tesla_class = 0, ret; +	unsigned chipset = nvws->channel->device->chipset; +	int i; + +	if (!screen) +		return NULL; +	
screen->nvws = nvws; + +	/* 2D object */ +	ret = nvws->grobj_alloc(nvws, NV50_2D, &screen->eng2d); +	if (ret) { +		NOUVEAU_ERR("Error creating 2D object: %d\n", ret); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* 3D object */ +	if ((chipset & 0xf0) != 0x50 && (chipset & 0xf0) != 0x80) { +		NOUVEAU_ERR("Not a G8x chipset\n"); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	switch (chipset & 0xf0) { +	case 0x50: +		if (NV5X_GRCLASS5097_CHIPSETS & (1 << (chipset & 0x0f))) +			tesla_class = 0x5097; +		break; +	case 0x80: +		if (NV8X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) +			tesla_class = 0x8297; +		break; +	case 0x90: +		if (NV9X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) +			tesla_class = 0x8297; +		break; +	default: +		break; +	} + +	if (tesla_class == 0) { +		NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla); +	if (ret) { +		NOUVEAU_ERR("Error creating 3D object: %d\n", ret); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Sync notifier */ +	ret = nvws->notifier_alloc(nvws, 1, &screen->sync); +	if (ret) { +		NOUVEAU_ERR("Error creating notifier object: %d\n", ret); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	/* Static 2D init */ +	so = so_new(64, 0); +	so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4); +	so_data  (so, screen->sync->handle); +	so_data  (so, screen->nvws->channel->vram->handle); +	so_data  (so, screen->nvws->channel->vram->handle); +	so_data  (so, screen->nvws->channel->vram->handle); +	so_method(so, screen->eng2d, NV50_2D_OPERATION, 1); +	so_data  (so, NV50_2D_OPERATION_SRCCOPY); +	so_method(so, screen->eng2d, 0x0290, 1); +	so_data  (so, 0); +	so_method(so, screen->eng2d, 0x0888, 1); +	so_data  (so, 1); +	so_emit(nvws, so); +	so_ref(NULL, &so); + +	/* Static tesla init */ +	so = so_new(256, 20); + +	so_method(so, screen->tesla, 0x1558, 1); +	so_data  (so, 1); +	so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1); +	so_data  (so, screen->sync->handle); +	so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0), +				     NV50TCL_DMA_UNK0__SIZE); +	for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++) +		so_data(so, nvws->channel->vram->handle); +	so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0), +				     NV50TCL_DMA_UNK1__SIZE); +	for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++) +		so_data(so, nvws->channel->vram->handle); +	so_method(so, screen->tesla, 0x121c, 1); +	so_data  (so, 1); + +	so_method(so, screen->tesla, 0x13bc, 1); +	so_data  (so, 0x54); +	so_method(so, screen->tesla, 0x13ac, 1); +	so_data  (so, 1); +	so_method(so, screen->tesla, 0x16b8, 1); +	so_data  (so, 8); + +	/* Shared constant buffer */ +	screen->constbuf = ws->buffer_create(ws, 0, 0, 128 * 4 * 4); +	if (nvws->res_init(&screen->vp_data_heap, 0, 128)) { +		NOUVEAU_ERR("Error initialising constant buffer\n"); +		nv50_screen_destroy(&screen->pipe); +		return NULL; +	} + +	so_method(so, screen->tesla, 0x1280, 3); +	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, (NV50_CB_PMISC << 16) | 0x00001000); + +	/* Texture sampler/image unit setup - we abuse the constant buffer +	 * upload mechanism for the moment to upload data to the tex config +	 * blocks.  At some point we *may* want to go the NVIDIA way of doing +	 * things? 
+	 */ +	screen->tic = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); +	so_method(so, screen->tesla, 0x1280, 3); +	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, (NV50_CB_TIC << 16) | 0x0800); +	so_method(so, screen->tesla, 0x1574, 3); +	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, 0x00000800); + +	screen->tsc = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); +	so_method(so, screen->tesla, 0x1280, 3); +	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, (NV50_CB_TSC << 16) | 0x0800); +	so_method(so, screen->tesla, 0x155c, 3); +	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | +		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	so_data  (so, 0x00000800); + + +	/* Vertex array limits - max them out */ +	for (i = 0; i < 16; i++) { +		so_method(so, screen->tesla, 0x1080 + (i * 8), 2); +		so_data  (so, 0x000000ff); +		so_data  (so, 0xffffffff); +	} + +	so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2); +	so_data  (so, fui(0.0)); +	so_data  (so, fui(1.0)); + +	so_method(so, screen->tesla, 0x1234, 1); +	so_data  (so, 1); +	so_method(so, screen->tesla, 0x1458, 1); +	so_data  (so, 1); + +	so_emit(nvws, so); +	so_ref(so, &screen->static_init); +	nvws->push_flush(nvws, 0, NULL); + +	screen->pipe.winsys = ws; + +	screen->pipe.destroy = nv50_screen_destroy; + +	screen->pipe.get_name = nv50_screen_get_name; +	screen->pipe.get_vendor = nv50_screen_get_vendor; +	screen->pipe.get_param = nv50_screen_get_param; +	screen->pipe.get_paramf = nv50_screen_get_paramf; + +	screen->pipe.is_format_supported = nv50_screen_is_format_supported; + +	nv50_screen_init_miptree_functions(&screen->pipe); +	nv50_surface_init_screen_functions(&screen->pipe); +	u_simple_screen_init(&screen->pipe); + +	return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h new file mode 100644 index 0000000000..c888ca071c --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -0,0 +1,34 @@ +#ifndef __NV50_SCREEN_H__ +#define __NV50_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv50_screen { +	struct pipe_screen pipe; + +	struct nouveau_winsys *nvws; + +	unsigned cur_pctx; + +	struct nouveau_grobj *tesla; +	struct nouveau_grobj *eng2d; +	struct nouveau_notifier *sync; + +	struct pipe_buffer *constbuf; +	struct nouveau_resource *vp_data_heap; + +	struct pipe_buffer *tic; +	struct pipe_buffer *tsc; + +	struct nouveau_stateobj *static_init; +}; + +static INLINE struct nv50_screen * +nv50_screen(struct pipe_screen *screen) +{ +	return (struct nv50_screen *)screen; +} + +void nv50_surface_init_screen_functions(struct pipe_screen *); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c new file mode 100644 index 0000000000..787ff958ec --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -0,0 +1,664 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the 
"Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static void * +nv50_blend_state_create(struct pipe_context *pipe, +			const struct pipe_blend_state *cso) +{ +	struct nouveau_stateobj *so = so_new(64, 0); +	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; +	struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj); +	unsigned cmask = 0, i; + +	/*XXX ignored: +	 * 	- dither +	 */ + +	if (cso->blend_enable == 0) { +		so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); +		for (i = 0; i < 8; i++) +			so_data(so, 0); +	} else { +		so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); +		for (i = 0; i < 8; i++) +			so_data(so, 1); +		so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5); +		so_data  (so, nvgl_blend_eqn(cso->rgb_func)); +		so_data  (so, 0x4000 | nvgl_blend_func(cso->rgb_src_factor)); +		so_data  (so, 0x4000 | nvgl_blend_func(cso->rgb_dst_factor)); +		so_data  (so, nvgl_blend_eqn(cso->alpha_func)); +		so_data  (so, 0x4000 | nvgl_blend_func(cso->alpha_src_factor)); +		so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1); +		so_data  (so, 0x4000 | nvgl_blend_func(cso->alpha_dst_factor)); +	} + +	if (cso->logicop_enable == 0 ) { +		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 1); +		so_data  (so, 0); +	} else { +		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 2); +		so_data  (so, 1); +		so_data  (so, nvgl_logicop_func(cso->logicop_func)); +	} + +	if (cso->colormask & PIPE_MASK_R) +		cmask |= (1 << 0); +	if (cso->colormask & PIPE_MASK_G) +		cmask |= (1 << 4); +	if (cso->colormask & PIPE_MASK_B) +		cmask |= (1 << 8); +	if (cso->colormask & PIPE_MASK_A) +		cmask |= (1 << 12); +	so_method(so, tesla, NV50TCL_COLOR_MASK(0), 8); +	for (i = 0; i < 8; i++) +		so_data(so, cmask); + +	bso->pipe = *cso; +	so_ref(so, &bso->so); +	return (void *)bso; +} + +static void +nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->blend = hwcso; +	nv50->dirty |= NV50_NEW_BLEND; +} + +static void +nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_blend_stateobj *bso = hwcso; + +	so_ref(NULL, &bso->so); +	FREE(bso); +} + +static INLINE unsigned +wrap_mode(unsigned wrap) +{ +	switch (wrap) { +	case PIPE_TEX_WRAP_REPEAT: +		return NV50TSC_1_0_WRAPS_REPEAT; +	case PIPE_TEX_WRAP_MIRROR_REPEAT: +		return NV50TSC_1_0_WRAPS_MIRROR_REPEAT; +	case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +		return 
NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE; +	case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +		return NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER; +	case PIPE_TEX_WRAP_CLAMP: +		return NV50TSC_1_0_WRAPS_CLAMP; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE; +	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER; +	case PIPE_TEX_WRAP_MIRROR_CLAMP: +		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP; +	default: +		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); +		return NV50TSC_1_0_WRAPS_REPEAT; +	} +} +static void * +nv50_sampler_state_create(struct pipe_context *pipe, +			  const struct pipe_sampler_state *cso) +{ +	unsigned *tsc = CALLOC(8, sizeof(unsigned)); + +	tsc[0] = (0x00024000 | +		  (wrap_mode(cso->wrap_s) << 0) | +		  (wrap_mode(cso->wrap_t) << 3) | +		  (wrap_mode(cso->wrap_r) << 6)); + +	switch (cso->mag_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		tsc[1] |= NV50TSC_1_1_MAGF_NEAREST; +		break; +	} + +	switch (cso->min_img_filter) { +	case PIPE_TEX_FILTER_LINEAR: +		tsc[1] |= NV50TSC_1_1_MINF_LINEAR; +		break; +	case PIPE_TEX_FILTER_NEAREST: +	default: +		tsc[1] |= NV50TSC_1_1_MINF_NEAREST; +		break; +	} + +	switch (cso->min_mip_filter) { +	case PIPE_TEX_MIPFILTER_LINEAR: +		tsc[1] |= NV50TSC_1_1_MIPF_LINEAR; +		break; +	case PIPE_TEX_MIPFILTER_NEAREST: +		tsc[1] |= NV50TSC_1_1_MIPF_NEAREST; +		break; +	case PIPE_TEX_MIPFILTER_NONE: +	default: +		tsc[1] |= NV50TSC_1_1_MIPF_NONE; +		break; +	} + +	if (cso->max_anisotropy >= 16.0) +		tsc[0] |= (7 << 20); +	else +	if (cso->max_anisotropy >= 12.0) +		tsc[0] |= (6 << 20); +	else +	if (cso->max_anisotropy >= 10.0) +		tsc[0] |= (5 << 20); +	else +	if (cso->max_anisotropy >= 8.0) +		tsc[0] |= (4 << 20); +	else +	if (cso->max_anisotropy >= 6.0) +		tsc[0] |= (3 << 20); +	else +	if (cso->max_anisotropy >= 4.0) +		tsc[0] |= (2 << 20); +	else +	if (cso->max_anisotropy >= 2.0) +		tsc[0] |= (1 << 20); + +	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +		tsc[0] |= (1 << 8); +		tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7); +	} + +	return (void *)tsc; +} + +static void +nv50_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	int i; + +	nv50->sampler_nr = nr; +	for (i = 0; i < nv50->sampler_nr; i++) +		nv50->sampler[i] = sampler[i]; + +	nv50->dirty |= NV50_NEW_SAMPLER; +} + +static void +nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	FREE(hwcso); +} + +static void +nv50_set_sampler_texture(struct pipe_context *pipe, unsigned nr, +			 struct pipe_texture **pt) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	int i; + +	for (i = 0; i < nr; i++) +		pipe_texture_reference((void *)&nv50->miptree[i], pt[i]); +	for (i = nr; i < nv50->miptree_nr; i++) +		pipe_texture_reference((void *)&nv50->miptree[i], NULL); + +	nv50->miptree_nr = nr; +	nv50->dirty |= NV50_NEW_TEXTURE; +} + +static void * +nv50_rasterizer_state_create(struct pipe_context *pipe, +			     const struct pipe_rasterizer_state *cso) +{ +	struct nouveau_stateobj *so = so_new(64, 0); +	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; +	struct nv50_rasterizer_stateobj *rso = +		CALLOC_STRUCT(nv50_rasterizer_stateobj); + +	/*XXX: ignored +	 * 	- light_twosize +	 * 	- point_smooth +	 * 	- multisample +	 * 	- point_sprite / sprite_coord_mode +	 */ + +	so_method(so, tesla, NV50TCL_SHADE_MODEL, 1); +	so_data  (so, 
cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT : +				       NV50TCL_SHADE_MODEL_SMOOTH); + +	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1); +	so_data  (so, fui(cso->line_width)); +	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1); +	so_data  (so, cso->line_smooth ? 1 : 0); +	if (cso->line_stipple_enable) { +		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); +		so_data  (so, 1); +		so_method(so, tesla, NV50TCL_LINE_STIPPLE_PATTERN, 1); +		so_data  (so, (cso->line_stipple_pattern << 8) | +			       cso->line_stipple_factor); +	} else { +		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); +		so_data  (so, 0); +	} + +	so_method(so, tesla, NV50TCL_POINT_SIZE, 1); +	so_data  (so, fui(cso->point_size)); + +	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3); +	if (cso->front_winding == PIPE_WINDING_CCW) { +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +	} else { +		so_data(so, nvgl_polygon_mode(cso->fill_cw)); +		so_data(so, nvgl_polygon_mode(cso->fill_ccw)); +	} +	so_data(so, cso->poly_smooth ? 1 : 0); + +	so_method(so, tesla, NV50TCL_CULL_FACE_ENABLE, 3); +	so_data  (so, cso->cull_mode != PIPE_WINDING_NONE); +	if (cso->front_winding == PIPE_WINDING_CCW) { +		so_data(so, NV50TCL_FRONT_FACE_CCW); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV50TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV50TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV50TCL_CULL_FACE_BACK); +			break; +		} +	} else { +		so_data(so, NV50TCL_FRONT_FACE_CW); +		switch (cso->cull_mode) { +		case PIPE_WINDING_CCW: +			so_data(so, NV50TCL_CULL_FACE_BACK); +			break; +		case PIPE_WINDING_CW: +			so_data(so, NV50TCL_CULL_FACE_FRONT); +			break; +		case PIPE_WINDING_BOTH: +			so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); +			break; +		default: +			so_data(so, NV50TCL_CULL_FACE_BACK); +			break; +		} +	} + +	so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_ENABLE, 1); +	so_data  (so, cso->poly_stipple_enable ? 
1 : 0); + +	so_method(so, tesla, NV50TCL_POLYGON_OFFSET_POINT_ENABLE, 3); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) +		so_data(so, 1); +	else +		so_data(so, 0); +	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || +	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) +		so_data(so, 1); +	else +		so_data(so, 0); + +	if (cso->offset_cw || cso->offset_ccw) { +		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1); +		so_data  (so, fui(cso->offset_scale)); +		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1); +		so_data  (so, fui(cso->offset_units)); +	} + +	rso->pipe = *cso; +	so_ref(so, &rso->so); +	return (void *)rso; +} + +static void +nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->rasterizer = hwcso; +	nv50->dirty |= NV50_NEW_RASTERIZER; +} + +static void +nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_rasterizer_stateobj *rso = hwcso; + +	so_ref(NULL, &rso->so); +	FREE(rso); +} + +static void * +nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe, +			const struct pipe_depth_stencil_alpha_state *cso) +{ +	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; +	struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj); +	struct nouveau_stateobj *so = so_new(64, 0); + +	so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1); +	so_data  (so, cso->depth.writemask ? 1 : 0); +	if (cso->depth.enabled) { +		so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); +		so_data  (so, 1); +		so_method(so, tesla, NV50TCL_DEPTH_TEST_FUNC, 1); +		so_data  (so, nvgl_comparison_op(cso->depth.func)); +	} else { +		so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); +		so_data  (so, 0); +	} + +	/*XXX: yes, I know they're backwards.. 
header needs fixing */ +	if (cso->stencil[0].enabled) { +		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5); +		so_data  (so, 1); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); +		so_data  (so, nvgl_comparison_op(cso->stencil[0].func)); +		so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3); +		so_data  (so, cso->stencil[0].ref_value); +		so_data  (so, cso->stencil[0].writemask); +		so_data  (so, cso->stencil[0].valuemask); +	} else { +		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1); +		so_data  (so, 0); +	} + +	if (cso->stencil[1].enabled) { +		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8); +		so_data  (so, 1); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); +		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); +		so_data  (so, nvgl_comparison_op(cso->stencil[1].func)); +		so_data  (so, cso->stencil[1].ref_value); +		so_data  (so, cso->stencil[1].writemask); +		so_data  (so, cso->stencil[1].valuemask); +	} else { +		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1); +		so_data  (so, 0); +	} + +	if (cso->alpha.enabled) { +		so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); +		so_data  (so, 1); +		so_method(so, tesla, NV50TCL_ALPHA_TEST_REF, 2); +		so_data  (so, fui(cso->alpha.ref_value)); +		so_data  (so, nvgl_comparison_op(cso->alpha.func)); +	} else { +		so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); +		so_data  (so, 0); +	} + +	zsa->pipe = *cso; +	so_ref(so, &zsa->so); +	return (void *)zsa; +} + +static void +nv50_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->zsa = hwcso; +	nv50->dirty |= NV50_NEW_ZSA; +} + +static void +nv50_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_zsa_stateobj *zsa = hwcso; + +	so_ref(NULL, &zsa->so); +	FREE(zsa); +} + +static void * +nv50_vp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv50_program *p = CALLOC_STRUCT(nv50_program); + +	p->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	p->type = PIPE_SHADER_VERTEX; +	tgsi_scan_shader(p->pipe.tokens, &p->info); +	return (void *)p; +} + +static void +nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->vertprog = hwcso; +	nv50->dirty |= NV50_NEW_VERTPROG; +} + +static void +nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nv50_program *p = hwcso; + +	nv50_program_destroy(nv50, p); +	FREE((void*)p->pipe.tokens); +	FREE(p); +} + +static void * +nv50_fp_state_create(struct pipe_context *pipe, +		     const struct pipe_shader_state *cso) +{ +	struct nv50_program *p = CALLOC_STRUCT(nv50_program); + +	p->pipe.tokens = tgsi_dup_tokens(cso->tokens); +	p->type = PIPE_SHADER_FRAGMENT; +	tgsi_scan_shader(p->pipe.tokens, &p->info); +	return (void *)p; +} + +static void +nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->fragprog = hwcso; +	nv50->dirty |= NV50_NEW_FRAGPROG; +} + +static void +nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nv50_program *p = hwcso; + +	nv50_program_destroy(nv50, p); +	
FREE((void*)p->pipe.tokens); +	FREE(p); +} + +static void +nv50_set_blend_color(struct pipe_context *pipe, +		     const struct pipe_blend_color *bcol) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->blend_colour = *bcol; +	nv50->dirty |= NV50_NEW_BLEND_COLOUR; +} + +static void +nv50_set_clip_state(struct pipe_context *pipe, +		    const struct pipe_clip_state *clip) +{ +} + +static void +nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, +			 const struct pipe_constant_buffer *buf ) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	if (shader == PIPE_SHADER_VERTEX) { +		nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer; +		nv50->dirty |= NV50_NEW_VERTPROG_CB; +	} else +	if (shader == PIPE_SHADER_FRAGMENT) { +		nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer; +		nv50->dirty |= NV50_NEW_FRAGPROG_CB; +	} +} + +static void +nv50_set_framebuffer_state(struct pipe_context *pipe, +			   const struct pipe_framebuffer_state *fb) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->framebuffer = *fb; +	nv50->dirty |= NV50_NEW_FRAMEBUFFER; +} + +static void +nv50_set_polygon_stipple(struct pipe_context *pipe, +			 const struct pipe_poly_stipple *stipple) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->stipple = *stipple; +	nv50->dirty |= NV50_NEW_STIPPLE; +} + +static void +nv50_set_scissor_state(struct pipe_context *pipe, +		       const struct pipe_scissor_state *s) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->scissor = *s; +	nv50->dirty |= NV50_NEW_SCISSOR; +} + +static void +nv50_set_viewport_state(struct pipe_context *pipe, +			const struct pipe_viewport_state *vpt) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	nv50->viewport = *vpt; +	nv50->dirty |= NV50_NEW_VIEWPORT; +} + +static void +nv50_set_vertex_buffers(struct pipe_context *pipe, unsigned count, +			const struct pipe_vertex_buffer *vb) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	memcpy(nv50->vtxbuf, vb, sizeof(*vb) * count); +	nv50->vtxbuf_nr = count; + +	nv50->dirty |= NV50_NEW_ARRAYS; +} + +static void +nv50_set_vertex_elements(struct pipe_context *pipe, unsigned count, +			 const struct pipe_vertex_element *ve) +{ +	struct nv50_context *nv50 = nv50_context(pipe); + +	memcpy(nv50->vtxelt, ve, sizeof(*ve) * count); +	nv50->vtxelt_nr = count; + +	nv50->dirty |= NV50_NEW_ARRAYS; +} + +void +nv50_init_state_functions(struct nv50_context *nv50) +{ +	nv50->pipe.create_blend_state = nv50_blend_state_create; +	nv50->pipe.bind_blend_state = nv50_blend_state_bind; +	nv50->pipe.delete_blend_state = nv50_blend_state_delete; + +	nv50->pipe.create_sampler_state = nv50_sampler_state_create; +	nv50->pipe.bind_sampler_states = nv50_sampler_state_bind; +	nv50->pipe.delete_sampler_state = nv50_sampler_state_delete; +	nv50->pipe.set_sampler_textures = nv50_set_sampler_texture; + +	nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create; +	nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind; +	nv50->pipe.delete_rasterizer_state = nv50_rasterizer_state_delete; + +	nv50->pipe.create_depth_stencil_alpha_state = +		nv50_depth_stencil_alpha_state_create; +	nv50->pipe.bind_depth_stencil_alpha_state = +		nv50_depth_stencil_alpha_state_bind; +	nv50->pipe.delete_depth_stencil_alpha_state = +		nv50_depth_stencil_alpha_state_delete; + +	nv50->pipe.create_vs_state = nv50_vp_state_create; +	nv50->pipe.bind_vs_state = nv50_vp_state_bind; +	nv50->pipe.delete_vs_state = nv50_vp_state_delete; + +	nv50->pipe.create_fs_state = 
nv50_fp_state_create; +	nv50->pipe.bind_fs_state = nv50_fp_state_bind; +	nv50->pipe.delete_fs_state = nv50_fp_state_delete; + +	nv50->pipe.set_blend_color = nv50_set_blend_color; +	nv50->pipe.set_clip_state = nv50_set_clip_state; +	nv50->pipe.set_constant_buffer = nv50_set_constant_buffer; +	nv50->pipe.set_framebuffer_state = nv50_set_framebuffer_state; +	nv50->pipe.set_polygon_stipple = nv50_set_polygon_stipple; +	nv50->pipe.set_scissor_state = nv50_set_scissor_state; +	nv50->pipe.set_viewport_state = nv50_set_viewport_state; + +	nv50->pipe.set_vertex_buffers = nv50_set_vertex_buffers; +	nv50->pipe.set_vertex_elements = nv50_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c new file mode 100644 index 0000000000..948112ffa9 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -0,0 +1,313 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "nv50_context.h" +#include "nouveau/nouveau_stateobj.h" + +static void +nv50_state_validate_fb(struct nv50_context *nv50) +{ +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *so = so_new(128, 18); +	struct pipe_framebuffer_state *fb = &nv50->framebuffer; +	unsigned i, w, h, gw = 0; + +	for (i = 0; i < fb->nr_cbufs; i++) { +		if (!gw) { +			w = fb->cbufs[i]->width; +			h = fb->cbufs[i]->height; +			gw = 1; +		} else { +			assert(w == fb->cbufs[i]->width); +			assert(h == fb->cbufs[i]->height); +		} + +		so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2); +		so_data  (so, fb->cbufs[i]->width); +		so_data  (so, fb->cbufs[i]->height); + +		so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5); +		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset, +			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | +			  NOUVEAU_BO_RDWR, 0, 0); +		so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset, +			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | +			  NOUVEAU_BO_RDWR, 0, 0); +		switch (fb->cbufs[i]->format) { +		case PIPE_FORMAT_A8R8G8B8_UNORM: +			so_data(so, 0xcf); +			break; +		case PIPE_FORMAT_R5G6B5_UNORM: +			so_data(so, 0xe8); +			break; +		default: +			NOUVEAU_ERR("AIIII unknown format %s\n", +				    pf_name(fb->cbufs[i]->format)); +			so_data(so, 0xe6); +			break; +		} +		so_data(so, 0x00000000); +		so_data(so, 0x00000000); + +		so_method(so, tesla, 0x1224, 1); +		so_data  (so, 1); +	} + +	if (fb->zsbuf) { +		if (!gw) { +			w = fb->zsbuf->width; +			h = fb->zsbuf->height; +			gw = 1; +		} else { +			assert(w == fb->zsbuf->width); +			assert(h == fb->zsbuf->height); +		} + +		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5); +		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset, +			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | +			  NOUVEAU_BO_RDWR, 0, 0); +		so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset, +			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | +			  NOUVEAU_BO_RDWR, 0, 0); +		switch (fb->zsbuf->format) { +		case PIPE_FORMAT_Z24S8_UNORM: +			so_data(so, 0x16); +			break; +		case PIPE_FORMAT_Z16_UNORM: +			so_data(so, 0x15); +			break; +		default: +			NOUVEAU_ERR("AIIII unknown format %s\n", +				    pf_name(fb->zsbuf->format)); +			so_data(so, 0x16); +			break; +		} +		so_data(so, 0x00000000); +		so_data(so, 0x00000000); + +		so_method(so, tesla, 0x1538, 1); +		so_data  (so, 1); +		so_method(so, tesla, 0x1228, 3); +		so_data  (so, fb->zsbuf->width); +		so_data  (so, fb->zsbuf->height); +		so_data  (so, 0x00010001); +	} + +	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2); +	so_data  (so, w << 16); +	so_data  (so, h << 16); +	so_method(so, tesla, 0x0e04, 2); +	so_data  (so, w << 16); +	so_data  (so, h << 16); +	so_method(so, tesla, 0xdf8, 2); +	so_data  (so, 0); +	so_data  (so, h); + +	so_ref(so, &nv50->state.fb); +} + +static void +nv50_state_emit(struct nv50_context *nv50) +{ +	struct nv50_screen *screen = nv50->screen; +	struct nouveau_winsys *nvws = screen->nvws; + +	if (nv50->pctx_id != screen->cur_pctx) { +		nv50->state.dirty |= 0xffffffff; +		screen->cur_pctx = nv50->pctx_id; +	} + +	if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER) +		so_emit(nvws, nv50->state.fb); +	if (nv50->state.dirty & NV50_NEW_BLEND) +		so_emit(nvws, nv50->state.blend); +	if (nv50->state.dirty & NV50_NEW_ZSA) +		so_emit(nvws, nv50->state.zsa); +	if (nv50->state.dirty & NV50_NEW_VERTPROG) +		so_emit(nvws, nv50->state.vertprog); +	if (nv50->state.dirty & NV50_NEW_FRAGPROG) +		so_emit(nvws, nv50->state.fragprog); +	if (nv50->state.dirty & 
NV50_NEW_RASTERIZER) +		so_emit(nvws, nv50->state.rast); +	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR) +		so_emit(nvws, nv50->state.blend_colour); +	if (nv50->state.dirty & NV50_NEW_STIPPLE) +		so_emit(nvws, nv50->state.stipple); +	if (nv50->state.dirty & NV50_NEW_SCISSOR) +		so_emit(nvws, nv50->state.scissor); +	if (nv50->state.dirty & NV50_NEW_VIEWPORT) +		so_emit(nvws, nv50->state.viewport); +	if (nv50->state.dirty & NV50_NEW_SAMPLER) +		so_emit(nvws, nv50->state.tsc_upload); +	if (nv50->state.dirty & NV50_NEW_TEXTURE) +		so_emit(nvws, nv50->state.tic_upload); +	if (nv50->state.dirty & NV50_NEW_ARRAYS) { +		so_emit(nvws, nv50->state.vtxfmt); +		so_emit(nvws, nv50->state.vtxbuf); +	} +	nv50->state.dirty = 0; + +	so_emit_reloc_markers(nvws, nv50->state.fb); +	so_emit_reloc_markers(nvws, nv50->state.vertprog); +	so_emit_reloc_markers(nvws, nv50->state.fragprog); +	so_emit_reloc_markers(nvws, nv50->state.vtxbuf); +	so_emit_reloc_markers(nvws, nv50->screen->static_init); +} + +boolean +nv50_state_validate(struct nv50_context *nv50) +{ +	const struct pipe_framebuffer_state *fb = &nv50->framebuffer; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *so; +	unsigned i; + +	for (i = 0; i < fb->nr_cbufs; i++) +		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + +	if (fb->zsbuf) +		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + +	if (nv50->dirty & NV50_NEW_FRAMEBUFFER) +		nv50_state_validate_fb(nv50); + +	if (nv50->dirty & NV50_NEW_BLEND) +		so_ref(nv50->blend->so, &nv50->state.blend); + +	if (nv50->dirty & NV50_NEW_ZSA) +		so_ref(nv50->zsa->so, &nv50->state.zsa); + +	if (nv50->dirty & (NV50_NEW_VERTPROG | NV50_NEW_VERTPROG_CB)) +		nv50_vertprog_validate(nv50); + +	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB)) +		nv50_fragprog_validate(nv50); + +	if (nv50->dirty & NV50_NEW_RASTERIZER) +		so_ref(nv50->rasterizer->so, &nv50->state.rast); + +	if (nv50->dirty & NV50_NEW_BLEND_COLOUR) { +		so = so_new(5, 0); +		so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4); +		so_data  (so, fui(nv50->blend_colour.color[0])); +		so_data  (so, fui(nv50->blend_colour.color[1])); +		so_data  (so, fui(nv50->blend_colour.color[2])); +		so_data  (so, fui(nv50->blend_colour.color[3])); +		so_ref(so, &nv50->state.blend_colour); +	} + +	if (nv50->dirty & NV50_NEW_STIPPLE) { +		so = so_new(33, 0); +		so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32); +		for (i = 0; i < 32; i++) +			so_data(so, nv50->stipple.stipple[i]); +		so_ref(so, &nv50->state.stipple); +	} + +	if (nv50->dirty & (NV50_NEW_SCISSOR | NV50_NEW_RASTERIZER)) { +		struct pipe_rasterizer_state *rast = &nv50->rasterizer->pipe; +		struct pipe_scissor_state *s = &nv50->scissor; + +		if (nv50->state.scissor && +		    (rast->scissor == 0 && nv50->state.scissor_enabled == 0)) +			goto scissor_uptodate; +		nv50->state.scissor_enabled = rast->scissor; + +		so = so_new(3, 0); +		so_method(so, tesla, 0x0ff4, 2); +		if (nv50->state.scissor_enabled) { +			so_data(so, ((s->maxx - s->minx) << 16) | s->minx); +			so_data(so, ((s->maxy - s->miny) << 16) | s->miny); +		} else { +			so_data(so, (8192 << 16)); +			so_data(so, (8192 << 16)); +		} +		so_ref(so, &nv50->state.scissor); +		nv50->state.dirty |= NV50_NEW_SCISSOR; +	} +scissor_uptodate: + +	if (nv50->dirty & NV50_NEW_VIEWPORT) { +		unsigned bypass; + +		if (!nv50->rasterizer->pipe.bypass_clipping) +			bypass = 0; +		else +			bypass = 1; + +		if (nv50->state.viewport && +		    (bypass || !(nv50->dirty & NV50_NEW_VIEWPORT)) && +		    nv50->state.viewport_bypass 
== bypass) +			goto viewport_uptodate; +		nv50->state.viewport_bypass = bypass; + +		so = so_new(12, 0); +		if (!bypass) { +			so_method(so, tesla, NV50TCL_VIEWPORT_UNK1(0), 3); +			so_data  (so, fui(nv50->viewport.translate[0])); +			so_data  (so, fui(nv50->viewport.translate[1])); +			so_data  (so, fui(nv50->viewport.translate[2])); +			so_method(so, tesla, NV50TCL_VIEWPORT_UNK0(0), 3); +			so_data  (so, fui(nv50->viewport.scale[0])); +			so_data  (so, fui(-nv50->viewport.scale[1])); +			so_data  (so, fui(nv50->viewport.scale[2])); +			so_method(so, tesla, 0x192c, 1); +			so_data  (so, 1); +			so_method(so, tesla, 0x0f90, 1); +			so_data  (so, 0); +		} else { +			so_method(so, tesla, 0x192c, 1); +			so_data  (so, 0); +			so_method(so, tesla, 0x0f90, 1); +			so_data  (so, 1); +		} + +		so_ref(so, &nv50->state.viewport); +	} +viewport_uptodate: + +	if (nv50->dirty & NV50_NEW_SAMPLER) { +		int i; + +		so = so_new(nv50->sampler_nr * 8 + 3, 0); +		so_method(so, tesla, 0x0f00, 1); +		so_data  (so, NV50_CB_TSC); +		so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8); +		for (i = 0; i < nv50->sampler_nr; i++) +			so_datap (so, nv50->sampler[i], 8); +		so_ref(so, &nv50->state.tsc_upload); +	} + +	if (nv50->dirty & NV50_NEW_TEXTURE) +		nv50_tex_validate(nv50); + +	if (nv50->dirty & NV50_NEW_ARRAYS) +		nv50_vbo_validate(nv50); + +	nv50->state.dirty |= nv50->dirty; +	nv50->dirty = 0; +	nv50_state_emit(nv50); + +	return TRUE; +} + diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c new file mode 100644 index 0000000000..f2dd2eb30b --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_surface.c @@ -0,0 +1,230 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#define __NOUVEAU_PUSH_H__ +#include <stdint.h> +#include "nouveau/nouveau_pushbuf.h" +#include "nv50_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" + +#include "util/u_tile.h" + +static INLINE int +nv50_format(enum pipe_format format) +{ +	switch (format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +	case PIPE_FORMAT_Z24S8_UNORM: +		return NV50_2D_DST_FORMAT_32BPP; +	case PIPE_FORMAT_X8R8G8B8_UNORM: +		return NV50_2D_DST_FORMAT_24BPP; +	case PIPE_FORMAT_R5G6B5_UNORM: +		return NV50_2D_DST_FORMAT_16BPP; +	case PIPE_FORMAT_A8_UNORM: +		return NV50_2D_DST_FORMAT_8BPP; +	default: +		return -1; +	} +} + +static int +nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst) +{ +	struct nouveau_channel *chan = screen->nvws->channel; +	struct nouveau_grobj *eng2d = screen->eng2d; +	struct nouveau_bo *bo; + 	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT; + 	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD); +  +	bo = screen->nvws->get_bo(nv50_miptree(ps->texture)->buffer); +	if (!bo) +		return 1; + + 	format = nv50_format(ps->format); + 	if (format < 0) + 		return 1; +   + 	if (!bo->tiled) { + 		BEGIN_RING(chan, eng2d, mthd, 2); + 		OUT_RING  (chan, format); + 		OUT_RING  (chan, 1); + 		BEGIN_RING(chan, eng2d, mthd + 0x14, 5); + 		OUT_RING  (chan, ps->stride); + 		OUT_RING  (chan, ps->width); + 		OUT_RING  (chan, ps->height); + 		OUT_RELOCh(chan, bo, ps->offset, flags); + 		OUT_RELOCl(chan, bo, ps->offset, flags); + 	} else { + 		BEGIN_RING(chan, eng2d, mthd, 5); + 		OUT_RING  (chan, format); + 		OUT_RING  (chan, 0); + 		OUT_RING  (chan, 0); + 		OUT_RING  (chan, 1); + 		OUT_RING  (chan, 0); + 		BEGIN_RING(chan, eng2d, mthd + 0x18, 4); + 		OUT_RING  (chan, ps->width); + 		OUT_RING  (chan, ps->height); + 		OUT_RELOCh(chan, bo, ps->offset, flags); + 		OUT_RELOCl(chan, bo, ps->offset, flags); + 	} +  +#if 0 + 	if (dst) { + 		BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4); + 		OUT_RING  (chan, 0); + 		OUT_RING  (chan, 0); + 		OUT_RING  (chan, surf->width); + 		OUT_RING  (chan, surf->height); + 	} +#endif +   + 	return 0; +} + +int +nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst, +		     int dx, int dy, struct pipe_surface *src, int sx, int sy, +		     int w, int h) +{ +	struct nouveau_channel *chan = screen->nvws->channel; +	struct nouveau_grobj *eng2d = screen->eng2d; +	int ret; + +	WAIT_RING (chan, 32); + +	ret = nv50_surface_set(screen, dst, 1); +	if (ret) +		return ret; + +	ret = nv50_surface_set(screen, src, 0); +	if (ret) +		return ret; + +	BEGIN_RING(chan, eng2d, 0x088c, 1); +	OUT_RING  (chan, 0); +	BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4); +	OUT_RING  (chan, dx); +	OUT_RING  (chan, dy); +	OUT_RING  (chan, w); +	OUT_RING  (chan, h); +	BEGIN_RING(chan, eng2d, 0x08c0, 4); +	OUT_RING  (chan, 0); +	OUT_RING  (chan, 1); +	OUT_RING  (chan, 0); +	OUT_RING  (chan, 1); +	BEGIN_RING(chan, eng2d, 0x08d0, 4); +	OUT_RING  (chan, 0); +	OUT_RING  (chan, sx); +	OUT_RING  (chan, 0); +	OUT_RING  (chan, sy); + +	return 0; +} + +static void +nv50_surface_copy(struct pipe_context *pipe, boolean flip, +		  struct pipe_surface *dest, unsigned destx, unsigned desty, +		  struct pipe_surface *src, unsigned srcx, unsigned srcy, +		  unsigned width, unsigned height) +{ +	struct nv50_context *nv50 = (struct nv50_context *)pipe; +	struct nv50_screen *screen = nv50->screen; + +	assert(src->format == dest->format); + +	if (flip) { +		desty += height; +		while (height--) { +		
	nv50_surface_do_copy(screen, dest, destx, desty--, src, +					     srcx, srcy++, width, 1); +		} +	} else { +		nv50_surface_do_copy(screen, dest, destx, desty, src, srcx, +				     srcy, width, height); +	} +} + +static void +nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, +		  unsigned destx, unsigned desty, unsigned width, +		  unsigned height, unsigned value) +{ +	struct nv50_context *nv50 = (struct nv50_context *)pipe; +	struct nv50_screen *screen = nv50->screen; +	struct nouveau_channel *chan = screen->nvws->channel; +	struct nouveau_grobj *eng2d = screen->eng2d; +	int format, ret; + +	format = nv50_format(dest->format); +	if (format < 0) +		return; + +	WAIT_RING (chan, 32); + +	ret = nv50_surface_set(screen, dest, 1); +	if (ret) +		return; + +	BEGIN_RING(chan, eng2d, 0x0580, 3); +	OUT_RING  (chan, 4); +	OUT_RING  (chan, format); +	OUT_RING  (chan, value); +	BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4); +	OUT_RING  (chan, destx); +	OUT_RING  (chan, desty); +	OUT_RING  (chan, width); +	OUT_RING  (chan, height); +} + +static void * +nv50_surface_map(struct pipe_screen *screen, struct pipe_surface *ps, +		 unsigned flags ) +{ +	struct pipe_winsys *ws = screen->winsys; + +	return ws->buffer_map(ws, nv50_surface_buffer(ps), flags); +} + +static void +nv50_surface_unmap(struct pipe_screen *pscreen, struct pipe_surface *ps) +{ +	struct pipe_winsys *ws = pscreen->winsys; + +	ws->buffer_unmap(ws, nv50_surface_buffer(ps)); +} + +void +nv50_init_surface_functions(struct nv50_context *nv50) +{ +	nv50->pipe.surface_copy = nv50_surface_copy; +	nv50->pipe.surface_fill = nv50_surface_fill; +} + +void +nv50_surface_init_screen_functions(struct pipe_screen *pscreen) +{ +	pscreen->surface_map = nv50_surface_map; +	pscreen->surface_unmap = nv50_surface_unmap; +} + diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c new file mode 100644 index 0000000000..675f9b20cb --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -0,0 +1,163 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static int +nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt) +{ +	switch (mt->base.format) { +	case PIPE_FORMAT_A8R8G8B8_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_8_8_8_8); +		break; +	case PIPE_FORMAT_A1R5G5B5_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_1_5_5_5); +		break; +	case PIPE_FORMAT_A4R4G4B4_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_4_4_4_4); +		break; +	case PIPE_FORMAT_R5G6B5_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_5_6_5); +		break; +	case PIPE_FORMAT_L8_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_8); +		break; +	case PIPE_FORMAT_A8_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_8); +		break; +	case PIPE_FORMAT_I8_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_8); +		break; +	case PIPE_FORMAT_A8L8_UNORM: +		so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_8_8); +		break; +	case PIPE_FORMAT_DXT1_RGB: +		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_DXT1); +		break; +	case PIPE_FORMAT_DXT1_RGBA: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_DXT1); +		break; +	case PIPE_FORMAT_DXT3_RGBA: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    
NV50TIC_0_0_FMT_DXT3); +		break; +	case PIPE_FORMAT_DXT5_RGBA: +		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | +			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | +			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | +			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | +			    NV50TIC_0_0_FMT_DXT5); +		break; +	default: +		return 1; +	} + +	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | +		     NOUVEAU_BO_RD, 0, 0); +	so_data (so, 0xd0005000); +	so_data (so, 0x00300000); +	so_data (so, mt->base.width[0]); +	so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]); +	so_data (so, 0x03000000); +	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | +		     NOUVEAU_BO_RD, 0, 0); + +	return 0; +} + +void +nv50_tex_validate(struct nv50_context *nv50) +{ +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *so; +	int unit, level, image; + +	so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2); +	so_method(so, tesla, 0x0f00, 1); +	so_data  (so, NV50_CB_TIC); +	so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8); +	for (unit = 0; unit < nv50->miptree_nr; unit++) { +		struct nv50_miptree *mt = nv50->miptree[unit]; + +		for (level = 0; level <= mt->base.last_level; level++) { +			for (image = 0; image < mt->image_nr; image++) { +				nv50_miptree_sync(&nv50->screen->pipe, mt, +						  level, image); +			} +		} + +		if (nv50_tex_construct(so, mt)) { +			NOUVEAU_ERR("failed tex validate\n"); +			so_ref(NULL, &so); +			return; +		} +	} + +	so_ref(so, &nv50->state.tic_upload); +} + diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h new file mode 100644 index 0000000000..aca622c73b --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -0,0 +1,129 @@ +#ifndef __NV50_TEXTURE_H__ +#define __NV50_TEXTURE_H__ + +/* It'd be really nice to have these in nouveau_class.h generated by + * renouveau like the rest of the object header - but not sure it can + * handle non-object stuff nicely - need to look into it. 
+ */ + +/* Texture image control block */ +#define NV50TIC_0_0_MAPA_MASK                                     0x38000000 +#define NV50TIC_0_0_MAPA_ZERO                                     0x00000000 +#define NV50TIC_0_0_MAPA_C0                                       0x10000000 +#define NV50TIC_0_0_MAPA_C1                                       0x18000000 +#define NV50TIC_0_0_MAPA_C2                                       0x20000000 +#define NV50TIC_0_0_MAPA_C3                                       0x28000000 +#define NV50TIC_0_0_MAPA_ONE                                      0x38000000 +#define NV50TIC_0_0_MAPR_MASK                                     0x07000000 +#define NV50TIC_0_0_MAPR_ZERO                                     0x00000000 +#define NV50TIC_0_0_MAPR_C0                                       0x02000000 +#define NV50TIC_0_0_MAPR_C1                                       0x03000000 +#define NV50TIC_0_0_MAPR_C2                                       0x04000000 +#define NV50TIC_0_0_MAPR_C3                                       0x05000000 +#define NV50TIC_0_0_MAPR_ONE                                      0x07000000 +#define NV50TIC_0_0_MAPG_MASK                                     0x00e00000 +#define NV50TIC_0_0_MAPG_ZERO                                     0x00000000 +#define NV50TIC_0_0_MAPG_C0                                       0x00400000 +#define NV50TIC_0_0_MAPG_C1                                       0x00600000 +#define NV50TIC_0_0_MAPG_C2                                       0x00800000 +#define NV50TIC_0_0_MAPG_C3                                       0x00a00000 +#define NV50TIC_0_0_MAPG_ONE                                      0x00e00000 +#define NV50TIC_0_0_MAPB_MASK                                     0x001c0000 +#define NV50TIC_0_0_MAPB_ZERO                                     0x00000000 +#define NV50TIC_0_0_MAPB_C0                                       0x00080000 +#define NV50TIC_0_0_MAPB_C1                                       0x000c0000 +#define NV50TIC_0_0_MAPB_C2                                       0x00100000 +#define NV50TIC_0_0_MAPB_C3                                       0x00140000 +#define NV50TIC_0_0_MAPB_ONE                                      0x001c0000 +#define NV50TIC_0_0_TYPEA_MASK                                    0x00038000 +#define NV50TIC_0_0_TYPEA_UNORM                                   0x00010000 +#define NV50TIC_0_0_TYPER_MASK                                    0x00007000 +#define NV50TIC_0_0_TYPER_UNORM                                   0x00002000 +#define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00 +#define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400 +#define NV50TIC_0_0_TYPEB_MASK                                    0x000001c0 +#define NV50TIC_0_0_TYPEB_UNORM                                   0x00000080 +#define NV50TIC_0_0_FMT_MASK                                      0x0000003c +#define NV50TIC_0_0_FMT_8_8_8_8                                   0x00000008 +#define NV50TIC_0_0_FMT_4_4_4_4                                   0x00000012 +#define NV50TIC_0_0_FMT_1_5_5_5                                   0x00000013 +#define NV50TIC_0_0_FMT_5_6_5                                     0x00000015 +#define NV50TIC_0_0_FMT_8_8                                       0x00000018 +#define NV50TIC_0_0_FMT_8                                         0x0000001d +#define NV50TIC_0_0_FMT_DXT1                                      0x00000024 +#define NV50TIC_0_0_FMT_DXT3                                      0x00000025 
+#define NV50TIC_0_0_FMT_DXT5                                      0x00000026 + +#define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff +#define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0 + +#define NV50TIC_0_2_UNKNOWN_MASK                                  0xffffffff + +#define NV50TIC_0_3_UNKNOWN_MASK                                  0xffffffff + +#define NV50TIC_0_4_WIDTH_MASK                                    0x0000ffff +#define NV50TIC_0_4_WIDTH_SHIFT                                            0 + +#define NV50TIC_0_5_DEPTH_MASK                                    0xffff0000 +#define NV50TIC_0_5_DEPTH_SHIFT                                           16 +#define NV50TIC_0_5_HEIGHT_MASK                                   0x0000ffff +#define NV50TIC_0_5_HEIGHT_SHIFT                                           0 + +#define NV50TIC_0_6_UNKNOWN_MASK                                  0xffffffff + +#define NV50TIC_0_7_OFFSET_HIGH_MASK                              0xffffffff +#define NV50TIC_0_7_OFFSET_HIGH_SHIFT                                      0 + +/* Texture sampler control block */ +#define NV50TSC_1_0_WRAPS_MASK                                   0x00000007 +#define NV50TSC_1_0_WRAPS_REPEAT                                 0x00000000 +#define NV50TSC_1_0_WRAPS_MIRROR_REPEAT                          0x00000001 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE                          0x00000002 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER                        0x00000003 +#define NV50TSC_1_0_WRAPS_CLAMP                                  0x00000004 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE                   0x00000005 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER                 0x00000006 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP                           0x00000007 +#define NV50TSC_1_0_WRAPT_MASK                                   0x00000038 +#define NV50TSC_1_0_WRAPT_REPEAT                                 0x00000000 +#define NV50TSC_1_0_WRAPT_MIRROR_REPEAT                          0x00000008 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_EDGE                          0x00000010 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_BORDER                        0x00000018 +#define NV50TSC_1_0_WRAPT_CLAMP                                  0x00000020 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_EDGE                   0x00000028 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_BORDER                 0x00000030 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP                           0x00000038 +#define NV50TSC_1_0_WRAPR_MASK                                   0x000001c0 +#define NV50TSC_1_0_WRAPR_REPEAT                                 0x00000000 +#define NV50TSC_1_0_WRAPR_MIRROR_REPEAT                          0x00000040 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_EDGE                          0x00000080 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_BORDER                        0x000000c0 +#define NV50TSC_1_0_WRAPR_CLAMP                                  0x00000100 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE                   0x00000140 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER                 0x00000180 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP                           0x000001c0 + +#define NV50TSC_1_1_MAGF_MASK                                    0x00000003 +#define NV50TSC_1_1_MAGF_NEAREST                                 0x00000001 +#define NV50TSC_1_1_MAGF_LINEAR                                  0x00000002 +#define NV50TSC_1_1_MINF_MASK                                    0x00000030 +#define 
NV50TSC_1_1_MINF_NEAREST                                 0x00000010 +#define NV50TSC_1_1_MINF_LINEAR                                  0x00000020 +#define NV50TSC_1_1_MIPF_MASK                                    0x000000c0 +#define NV50TSC_1_1_MIPF_NONE                                    0x00000040 +#define NV50TSC_1_1_MIPF_NEAREST                                 0x00000080 +#define NV50TSC_1_1_MIPF_LINEAR                                  0x000000c0 + +#define NV50TSC_1_2_UNKNOWN_MASK                                 0xffffffff + +#define NV50TSC_1_3_UNKNOWN_MASK                                 0xffffffff + +#define NV50TSC_1_4_UNKNOWN_MASK                                 0xffffffff + +#define NV50TSC_1_5_UNKNOWN_MASK                                 0xffffffff + +#define NV50TSC_1_6_UNKNOWN_MASK                                 0xffffffff + +#define NV50TSC_1_7_UNKNOWN_MASK                                 0xffffffff + +#endif diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c new file mode 100644 index 0000000000..0c970adb03 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -0,0 +1,254 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +static INLINE unsigned +nv50_prim(unsigned mode) +{ +	switch (mode) { +	case PIPE_PRIM_POINTS: return NV50TCL_VERTEX_BEGIN_POINTS; +	case PIPE_PRIM_LINES: return NV50TCL_VERTEX_BEGIN_LINES; +	case PIPE_PRIM_LINE_LOOP: return NV50TCL_VERTEX_BEGIN_LINE_LOOP; +	case PIPE_PRIM_LINE_STRIP: return NV50TCL_VERTEX_BEGIN_LINE_STRIP; +	case PIPE_PRIM_TRIANGLES: return NV50TCL_VERTEX_BEGIN_TRIANGLES; +	case PIPE_PRIM_TRIANGLE_STRIP: +		return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP; +	case PIPE_PRIM_TRIANGLE_FAN: return NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN; +	case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS; +	case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP; +	case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON; +	default: +		break; +	} + +	NOUVEAU_ERR("invalid primitive type %d\n", mode); +	return NV50TCL_VERTEX_BEGIN_POINTS; +} + +boolean +nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, +		 unsigned count) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; + +	nv50_state_validate(nv50); + +	BEGIN_RING(chan, tesla, 0x142c, 1); +	OUT_RING  (chan, 0); +	BEGIN_RING(chan, tesla, 0x142c, 1); +	OUT_RING  (chan, 0); +	BEGIN_RING(chan, tesla, 0x1440, 1); +	OUT_RING  (chan, 0); +	BEGIN_RING(chan, tesla, 0x1334, 1); +	OUT_RING  (chan, 0); + +	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); +	OUT_RING  (chan, nv50_prim(mode)); +	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); +	OUT_RING  (chan, start); +	OUT_RING  (chan, count); +	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); +	OUT_RING  (chan, 0); + +	pipe->flush(pipe, 0, NULL); +	return TRUE; +} + +static INLINE void +nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, +			      unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; + +	map += start; + +	if (count & 1) { +		BEGIN_RING(chan, tesla, 0x15e8, 1); +		OUT_RING  (chan, map[0]); +		map++; +		count--; +	} + +	while (count) { +		unsigned nr = count > 2046 ? 2046 : count; +		int i; + +		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1); +		for (i = 0; i < nr; i += 2) +			OUT_RING  (chan, (map[1] << 16) | map[0]); + +		count -= nr; +		map += nr; +	} +} + +static INLINE void +nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, +			      unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; + +	map += start; + +	if (count & 1) { +		BEGIN_RING(chan, tesla, 0x15e8, 1); +		OUT_RING  (chan, map[0]); +		map++; +		count--; +	} + +	while (count) { +		unsigned nr = count > 2046 ? 2046 : count; +		int i; + +		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1); +		for (i = 0; i < nr; i += 2) +			OUT_RING  (chan, (map[1] << 16) | map[0]); + +		count -= nr; +		map += nr; +	} +} + +static INLINE void +nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map, +			      unsigned start, unsigned count) +{ +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; + +	map += start; + +	while (count) { +		unsigned nr = count > 2047 ? 
2047 : count; + +		BEGIN_RING(chan, tesla, 0x400015e8, nr); +		OUT_RINGp (chan, map, nr); + +		count -= nr; +		map += nr; +	} +} + +boolean +nv50_draw_elements(struct pipe_context *pipe, +		   struct pipe_buffer *indexBuffer, unsigned indexSize, +		   unsigned mode, unsigned start, unsigned count) +{ +	struct nv50_context *nv50 = nv50_context(pipe); +	struct nouveau_channel *chan = nv50->screen->nvws->channel; +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct pipe_winsys *ws = pipe->winsys; +	void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ); + +	nv50_state_validate(nv50); + +	BEGIN_RING(chan, tesla, 0x142c, 1); +	OUT_RING  (chan, 0); +	BEGIN_RING(chan, tesla, 0x142c, 1); +	OUT_RING  (chan, 0); + +	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); +	OUT_RING  (chan, nv50_prim(mode)); +	switch (indexSize) { +	case 1: +		nv50_draw_elements_inline_u08(nv50, map, start, count); +		break; +	case 2: +		nv50_draw_elements_inline_u16(nv50, map, start, count); +		break; +	case 4: +		nv50_draw_elements_inline_u32(nv50, map, start, count); +		break; +	default: +		assert(0); +	} +	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); +	OUT_RING  (chan, 0); + +	pipe->flush(pipe, 0, NULL); +	return TRUE; +} + +void +nv50_vbo_validate(struct nv50_context *nv50) +{ +	struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *vtxbuf, *vtxfmt; +	int i, vpi = 0; + +	vtxbuf = so_new(nv50->vtxelt_nr * 4, nv50->vtxelt_nr * 2); +	vtxfmt = so_new(nv50->vtxelt_nr + 1, 0); +	so_method(vtxfmt, tesla, 0x1ac0, nv50->vtxelt_nr); + +	for (i = 0; i < nv50->vtxelt_nr; i++) { +		struct pipe_vertex_element *ve = &nv50->vtxelt[i]; +		struct pipe_vertex_buffer *vb = +			&nv50->vtxbuf[ve->vertex_buffer_index]; + +		switch (ve->src_format) { +		case PIPE_FORMAT_R32G32B32A32_FLOAT: +			so_data(vtxfmt, 0x7e080000 | i); +			break; +		case PIPE_FORMAT_R32G32B32_FLOAT: +			so_data(vtxfmt, 0x7e100000 | i); +			break; +		case PIPE_FORMAT_R32G32_FLOAT: +			so_data(vtxfmt, 0x7e200000 | i); +			break; +		case PIPE_FORMAT_R32_FLOAT: +			so_data(vtxfmt, 0x7e900000 | i); +			break; +		case PIPE_FORMAT_R8G8B8A8_UNORM: +			so_data(vtxfmt, 0x24500000 | i); +			break; +		default: +		{ +			NOUVEAU_ERR("invalid vbo format %s\n", +				    pf_name(ve->src_format)); +			assert(0); +			return; +		} +		} + +		so_method(vtxbuf, tesla, 0x900 + (i * 16), 3); +		so_data  (vtxbuf, 0x20000000 | vb->stride); +		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + +			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | +			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); +		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + +			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | +			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); +	} + +	so_ref (vtxfmt, &nv50->state.vtxfmt); +	so_ref (vtxbuf, &nv50->state.vtxbuf); +} + diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile new file mode 100644 index 0000000000..e83d943cd8 --- /dev/null +++ b/src/gallium/drivers/r300/Makefile @@ -0,0 +1,21 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = r300 + +C_SOURCES = \ +	r300_chipset.c \ +	r300_clear.c \ +	r300_context.c \ +	r300_emit.c \ +	r300_flush.c \ +	r300_screen.c \ +	r300_state.c \ +	r300_state_shader.c \ +	r300_surface.c \ +	r300_swtcl_emit.c \ +	r300_texture.c + +include ../../Makefile.template + +symlinks: diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript new file mode 100644 index 0000000000..18684c3e7f --- /dev/null +++ b/src/gallium/drivers/r300/SConscript @@ -0,0 +1,17 @@ +Import('*') + +env = env.Clone() + +r300 = env.ConvenienceLibrary( +	target = 'r300', +	source = [ +		'r300_blit.c', +		'r300_clear.c', +		'r300_context.c', +		'r300_screen.c', +		'r300_state.c', +		'r300_surface.c', +	]) + +Export('r300') + diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c new file mode 100644 index 0000000000..7def62422a --- /dev/null +++ b/src/gallium/drivers/r300/r300_chipset.c @@ -0,0 +1,348 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_chipset.h" +#include "pipe/p_debug.h" + +/* r300_chipset: A file all to itself for deducing the various properties of + * Radeons. */ + +/* Parse a PCI ID and fill an r300_capabilities struct with information. */ +void r300_parse_chipset(struct r300_capabilities* caps) +{ +    /* Reasonable defaults */ +    caps->has_tcl = TRUE; +    caps->is_r500 = FALSE; +    caps->num_vert_fpus = 4; + + +    /* Note: These are not ordered by PCI ID. I leave that task to GCC, +     * which will perform the ordering while collating jump tables. Instead, +     * I've tried to group them according to capabilities and age. 
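+     * As a concrete example (taken from the cases below): PCI ID 0x7100 is
+     * matched to CHIP_FAMILY_R520, with eight vertex FPUs and is_r500 set.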
*/ +    switch (caps->pci_id) { +        case 0x4144: +            caps->family = CHIP_FAMILY_R300; +            break; + +        case 0x4145: +        case 0x4146: +        case 0x4147: +        case 0x4E44: +        case 0x4E45: +        case 0x4E46: +        case 0x4E47: +            caps->family = CHIP_FAMILY_R300; +            break; + +        case 0x4150: +        case 0x4151: +        case 0x4152: +        case 0x4153: +        case 0x4154: +        case 0x4155: +        case 0x4156: +        case 0x4E50: +        case 0x4E51: +        case 0x4E52: +        case 0x4E53: +        case 0x4E54: +        case 0x4E56: +            caps->family = CHIP_FAMILY_RV350; +            break; + +        case 0x4148: +        case 0x4149: +        case 0x414A: +        case 0x414B: +        case 0x4E48: +        case 0x4E49: +        case 0x4E4B: +            caps->family = CHIP_FAMILY_R350; +            break; + +        case 0x4E4A: +            caps->family = CHIP_FAMILY_R360; +            break; + +        case 0x5460: +        case 0x5462: +        case 0x5464: +        case 0x5B60: +        case 0x5B62: +        case 0x5B63: +        case 0x5B64: +        case 0x5B65: +            caps->family = CHIP_FAMILY_RV370; +            break; + +        case 0x3150: +        case 0x3152: +        case 0x3154: +        case 0x3E50: +        case 0x3E54: +            caps->family = CHIP_FAMILY_RV380; +            break; + +        case 0x4A48: +        case 0x4A49: +        case 0x4A4A: +        case 0x4A4B: +        case 0x4A4C: +        case 0x4A4D: +        case 0x4A4E: +        case 0x4A4F: +        case 0x4A50: +        case 0x4A54: +            caps->family = CHIP_FAMILY_R420; +            caps->num_vert_fpus = 6; +            break; + +        case 0x5548: +        case 0x5549: +        case 0x554A: +        case 0x554B: +        case 0x5550: +        case 0x5551: +        case 0x5552: +        case 0x5554: +        case 0x5D57: +            caps->family = CHIP_FAMILY_R423; +            caps->num_vert_fpus = 6; +            break; + +        case 0x554C: +        case 0x554D: +        case 0x554E: +        case 0x554F: +        case 0x5D48: +        case 0x5D49: +        case 0x5D4A: +            caps->family = CHIP_FAMILY_R430; +            caps->num_vert_fpus = 6; +            break; + +        case 0x5D4C: +        case 0x5D4D: +        case 0x5D4E: +        case 0x5D4F: +        case 0x5D50: +        case 0x5D52: +            caps->family = CHIP_FAMILY_R480; +            caps->num_vert_fpus = 6; +            break; + +        case 0x4B49: +        case 0x4B4A: +        case 0x4B4B: +        case 0x4B4C: +            caps->family = CHIP_FAMILY_R481; +            caps->num_vert_fpus = 6; +            break; + +        case 0x5E4C: +        case 0x5E4F: +        case 0x564A: +        case 0x564B: +        case 0x564F: +        case 0x5652: +        case 0x5653: +        case 0x5657: +        case 0x5E48: +        case 0x5E4A: +        case 0x5E4B: +        case 0x5E4D: +            caps->family = CHIP_FAMILY_RV410; +            caps->num_vert_fpus = 6; +            break; + +        case 0x5954: +        case 0x5955: +            caps->family = CHIP_FAMILY_RS480; +            caps->has_tcl = FALSE; +            break; + +        case 0x5974: +        case 0x5975: +            caps->family = CHIP_FAMILY_RS482; +            caps->has_tcl = FALSE; +            break; + +        case 0x5A41: +        case 0x5A42: +            caps->family = CHIP_FAMILY_RS400; +            caps->has_tcl = FALSE; + 
           break; + +        case 0x5A61: +        case 0x5A62: +            caps->family = CHIP_FAMILY_RC410; +            caps->has_tcl = FALSE; +            break; + +        case 0x791E: +        case 0x791F: +            caps->family = CHIP_FAMILY_RS690; +            caps->has_tcl = FALSE; +            break; + +        case 0x796C: +        case 0x796D: +        case 0x796E: +        case 0x796F: +            caps->family = CHIP_FAMILY_RS740; +            caps->has_tcl = FALSE; +            break; + +        case 0x7100: +        case 0x7101: +        case 0x7102: +        case 0x7103: +        case 0x7104: +        case 0x7105: +        case 0x7106: +        case 0x7108: +        case 0x7109: +        case 0x710A: +        case 0x710B: +        case 0x710C: +        case 0x710E: +        case 0x710F: +            caps->family = CHIP_FAMILY_R520; +            caps->num_vert_fpus = 8; +            caps->is_r500 = TRUE; +            break; + +        case 0x7140: +        case 0x7141: +        case 0x7142: +        case 0x7143: +        case 0x7144: +        case 0x7145: +        case 0x7146: +        case 0x7147: +        case 0x7149: +        case 0x714A: +        case 0x714B: +        case 0x714C: +        case 0x714D: +        case 0x714E: +        case 0x714F: +        case 0x7151: +        case 0x7152: +        case 0x7153: +        case 0x715E: +        case 0x715F: +        case 0x7180: +        case 0x7181: +        case 0x7183: +        case 0x7186: +        case 0x7187: +        case 0x7188: +        case 0x718A: +        case 0x718B: +        case 0x718C: +        case 0x718D: +        case 0x718F: +        case 0x7193: +        case 0x7196: +        case 0x719B: +        case 0x719F: +        case 0x7200: +        case 0x7210: +        case 0x7211: +            caps->family = CHIP_FAMILY_RV515; +            caps->num_vert_fpus = 2; +            caps->is_r500 = TRUE; +            break; + +        case 0x71C0: +        case 0x71C1: +        case 0x71C2: +        case 0x71C3: +        case 0x71C4: +        case 0x71C5: +        case 0x71C6: +        case 0x71C7: +        case 0x71CD: +        case 0x71CE: +        case 0x71D2: +        case 0x71D4: +        case 0x71D5: +        case 0x71D6: +        case 0x71DA: +        case 0x71DE: +            caps->family = CHIP_FAMILY_RV530; +            caps->num_vert_fpus = 5; +            caps->is_r500 = TRUE; +            break; + +        case 0x7240: +        case 0x7243: +        case 0x7244: +        case 0x7245: +        case 0x7246: +        case 0x7247: +        case 0x7248: +        case 0x7249: +        case 0x724A: +        case 0x724B: +        case 0x724C: +        case 0x724D: +        case 0x724E: +        case 0x724F: +        case 0x7284: +            caps->family = CHIP_FAMILY_R580; +            caps->num_vert_fpus = 8; +            caps->is_r500 = TRUE; +            break; + +        case 0x7280: +            caps->family = CHIP_FAMILY_RV570; +            caps->num_vert_fpus = 5; +            caps->is_r500 = TRUE; +            break; + +        case 0x7281: +        case 0x7283: +        case 0x7287: +        case 0x7288: +        case 0x7289: +        case 0x728B: +        case 0x728C: +        case 0x7290: +        case 0x7291: +        case 0x7293: +        case 0x7297: +            caps->family = CHIP_FAMILY_RV560; +            caps->num_vert_fpus = 5; +            caps->is_r500 = TRUE; +            break; + +        default: +            debug_printf("r300: Warning: Unknown chipset 0x%x\n", +                
caps->pci_id); +            break; +    } + +    /* Force off TCL for now */ +    caps->has_tcl = FALSE; +} diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h new file mode 100644 index 0000000000..a9cd372ec5 --- /dev/null +++ b/src/gallium/drivers/r300/r300_chipset.h @@ -0,0 +1,79 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_CHIPSET_H +#define R300_CHIPSET_H + +#include "pipe/p_compiler.h" + +/* Structure containing all the possible information about a specific Radeon + * in the R3xx, R4xx, and R5xx families. */ +struct r300_capabilities { +    /* PCI ID */ +    uint32_t pci_id; +    /* Chipset family */ +    int family; +    /* The number of vertex floating-point units */ +    int num_vert_fpus; +    /* The number of fragment pipes */ +    int num_frag_pipes; +    /* Whether or not TCL is physically present */ +    boolean has_tcl; +    /* Whether or not this is an RV515 or newer; R500s have many differences +     * that require extra consideration, compared to their R3xx cousins: +     * - Extra bit of width and height on texture sizes +     * - Blend color is split across two registers +     * - Universal Shader (US) block used for fragment shaders */ +    boolean is_r500; +}; + +/* Enumerations for legibility and telling which card we're running on. 
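+ * The values are grouped rather than strictly ordered by release: discrete
+ * R3xx parts first, then R4xx, then the IGP derivatives, then the R5xx line.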
*/ +enum { +    CHIP_FAMILY_R300 = 0, +    CHIP_FAMILY_R350, +    CHIP_FAMILY_R360, +    CHIP_FAMILY_RV350, +    CHIP_FAMILY_RV370, +    CHIP_FAMILY_RV380, +    CHIP_FAMILY_R420, +    CHIP_FAMILY_R423, +    CHIP_FAMILY_R430, +    CHIP_FAMILY_R480, +    CHIP_FAMILY_R481, +    CHIP_FAMILY_RV410, +    CHIP_FAMILY_RS400, +    CHIP_FAMILY_RC410, +    CHIP_FAMILY_RS480, +    CHIP_FAMILY_RS482, +    CHIP_FAMILY_RS690, +    CHIP_FAMILY_RS740, +    CHIP_FAMILY_RV515, +    CHIP_FAMILY_R520, +    CHIP_FAMILY_RV530, +    CHIP_FAMILY_R580, +    CHIP_FAMILY_RV560, +    CHIP_FAMILY_RV570 +}; + +void r300_parse_chipset(struct r300_capabilities* caps); + +#endif /* R300_CHIPSET_H */ diff --git a/src/gallium/drivers/r300/r300_clear.c b/src/gallium/drivers/r300/r300_clear.c new file mode 100644 index 0000000000..fd28437aaa --- /dev/null +++ b/src/gallium/drivers/r300/r300_clear.c @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_clear.h" + +/* This gets its own file because Intel's is in its own file. + * I assume there's a good reason. */ +void r300_clear(struct pipe_context* pipe, +                struct pipe_surface* ps, +                unsigned color) +{ +    pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color); +    ps->status = PIPE_SURFACE_STATUS_DEFINED; +}
\ No newline at end of file diff --git a/src/gallium/drivers/r300/r300_clear.h b/src/gallium/drivers/r300/r300_clear.h new file mode 100644 index 0000000000..e24a0690c9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_clear.h @@ -0,0 +1,27 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "pipe/p_context.h" + +void r300_clear(struct pipe_context* pipe, +                struct pipe_surface* ps, +                unsigned color); diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c new file mode 100644 index 0000000000..7b605ae87a --- /dev/null +++ b/src/gallium/drivers/r300/r300_context.c @@ -0,0 +1,68 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_context.h" + +static void r300_destroy_context(struct pipe_context* context) { +    struct r300_context* r300 = r300_context(context); + +    draw_destroy(r300->draw); + +    FREE(r300->blend_color_state); +    FREE(r300->scissor_state); +    FREE(r300); +} + +struct pipe_context* r300_create_context(struct pipe_screen* screen, +                                         struct pipe_winsys* winsys, +                                         struct r300_winsys* r300_winsys) +{ +    struct r300_context* r300 = CALLOC_STRUCT(r300_context); + +    if (!r300) +        return NULL; + +    r300->winsys = r300_winsys; +    r300->context.winsys = winsys; +    r300->context.screen = r300_create_screen(winsys, r300_winsys); + +    r300->context.destroy = r300_destroy_context; + +    r300->context.clear = r300_clear; + +    r300->draw = draw_create(); +    /*XXX draw_set_rasterize_stage(r300->draw, r300_draw_swtcl_stage(r300));*/ + +    r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state); +    r300->scissor_state = CALLOC_STRUCT(r300_scissor_state); + +    r300_init_flush_functions(r300); + +    r300_init_surface_functions(r300); + +    r300_init_state_functions(r300); + +    r300->dirty_state = R300_NEW_KITCHEN_SINK; +    r300->dirty_hw++; + +    return &r300->context; +} diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h new file mode 100644 index 0000000000..376c57639d --- /dev/null +++ b/src/gallium/drivers/r300/r300_context.h @@ -0,0 +1,190 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef R300_CONTEXT_H +#define R300_CONTEXT_H + +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_memory.h" + +#include "r300_clear.h" +#include "r300_screen.h" +#include "r300_winsys.h" + +struct r300_blend_state { +    uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */ +    uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */ +    uint32_t rop;                 /* R300_RB3D_ROPCNTL: 0x4e18 */ +    uint32_t dither;              /* R300_RB3D_DITHER_CTL: 0x4e50 */ +}; + +struct r300_blend_color_state { +    /* RV515 and earlier */ +    uint32_t blend_color;            /* R300_RB3D_BLEND_COLOR: 0x4e10 */ +    /* R520 and newer */ +    uint32_t blend_color_red_alpha;  /* R500_RB3D_CONSTANT_COLOR_AR: 0x4ef8 */ +    uint32_t blend_color_green_blue; /* R500_RB3D_CONSTANT_COLOR_GB: 0x4efc */ +}; + +struct r300_dsa_state { +    uint32_t alpha_function;    /* R300_FG_ALPHA_FUNC: 0x4bd4 */ +    uint32_t alpha_reference;   /* R500_FG_ALPHA_VALUE: 0x4be0 */ +    uint32_t z_buffer_control;  /* R300_ZB_CNTL: 0x4f00 */ +    uint32_t z_stencil_control; /* R300_ZB_ZSTENCILCNTL: 0x4f04 */ +    uint32_t stencil_ref_mask;  /* R300_ZB_STENCILREFMASK: 0x4f08 */ +    uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */ +    uint32_t stencil_ref_bf;    /* R500_ZB_STENCILREFMASK_BF: 0x4fd4 */ +}; + +struct r300_rs_state { +    uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */ +    uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */ +    uint32_t line_control;          /* R300_GA_LINE_CNTL: 0x4234 */ +    uint32_t depth_scale_front;  /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */ +    uint32_t depth_offset_front;/* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */ +    uint32_t depth_scale_back;    /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */ +    uint32_t depth_offset_back;  /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */ +    uint32_t polygon_offset_enable; /* R300_SU_POLY_OFFSET_ENABLE: 0x42b4 */ +    uint32_t cull_mode;             /* R300_SU_CULL_MODE: 0x42b8 */ +    uint32_t line_stipple_config;   /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */ +    uint32_t line_stipple_value;    /* R300_GA_LINE_STIPPLE_VALUE: 0x4260 */ +}; + +struct r300_sampler_state { +    uint32_t filter0;      /* R300_TX_FILTER0: 0x4400 */ +    uint32_t filter1;      /* R300_TX_FILTER1: 0x4440 */ +    uint32_t border_color; /* R300_TX_BORDER_COLOR: 0x45c0 */ +}; + +struct r300_scissor_state { +    uint32_t scissor_top_left;     /* R300_SC_SCISSORS_TL: 0x43e0 */ +    uint32_t scissor_bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */ +}; + +struct r300_texture_state { +}; + +#define R300_NEW_BLEND           0x000001 +#define R300_NEW_BLEND_COLOR     0x000002 +#define R300_NEW_DSA             0x000004 +#define R300_NEW_FRAMEBUFFERS    0x000008 +#define R300_NEW_FRAGMENT_SHADER 0x000010 +#define R300_NEW_RASTERIZER      0x000020 +#define R300_NEW_SAMPLER         0x000040 +#define R300_NEW_SCISSOR         0x004000 +#define R300_NEW_TEXTURE         0x008000 +#define R300_NEW_VERTEX_SHADER   0x800000 +#define R300_NEW_KITCHEN_SINK    0xffffff + +/* The next several objects are not pure Radeon state; they inherit from + * various Gallium classes. */ + +struct r3xx_fragment_shader { +    /* Parent class */ +    struct pipe_shader_state state; +    struct tgsi_shader_info info; + +    /* Has this shader been translated yet? 
*/ +    boolean translated; +}; + +struct r300_fragment_shader { +    /* Parent class */ +    struct r3xx_fragment_shader shader; +}; + +struct r500_fragment_shader { +    /* Parent class */ +    struct r3xx_fragment_shader shader; +}; + +struct r300_texture { +    /* Parent class */ +    struct pipe_texture tex; + +    /* Offsets into the buffer. */ +    unsigned offset[PIPE_MAX_TEXTURE_LEVELS]; + +    /* Total size of this texture, in bytes. */ +    unsigned size; + +    /* Pipe buffer backing this texture. */ +    struct pipe_buffer* buffer; +}; + +struct r300_context { +    /* Parent class */ +    struct pipe_context context; + +    /* The interface to the windowing system, etc. */ +    struct r300_winsys* winsys; +    /* Draw module. Used mostly for SW TCL. */ +    struct draw_context* draw; + +    /* Various CSO state objects. */ +    /* Blend state. */ +    struct r300_blend_state* blend_state; +    /* Blend color state. */ +    struct r300_blend_color_state* blend_color_state; +    /* Depth, stencil, and alpha state. */ +    struct r300_dsa_state* dsa_state; +    /* Fragment shader. */ +    struct r3xx_fragment_shader* fs; +    /* Framebuffer state. We currently don't need our own version of this. */ +    struct pipe_framebuffer_state framebuffer_state; +    /* Rasterizer state. */ +    struct r300_rs_state* rs_state; +    /* Sampler states. */ +    struct r300_sampler_state* sampler_states[8]; +    int sampler_count; +    /* Scissor state. */ +    struct r300_scissor_state* scissor_state; +    /* Texture states. */ +    struct r300_texture* textures[8]; +    struct r300_texture_state* texture_states[8]; +    int texture_count; +    /* Bitmask of dirty state objects. */ +    uint32_t dirty_state; +    /* Flag indicating whether or not the HW is dirty. */ +    uint32_t dirty_hw; +}; + +/* Convenience cast wrapper. */ +static struct r300_context* r300_context(struct pipe_context* context) { +    return (struct r300_context*)context; +} + +/* Context initialization. */ +void r300_init_state_functions(struct r300_context* r300); +void r300_init_surface_functions(struct r300_context* r300); + +/* Fun with includes: r300_winsys also declares this prototype. + * We'll just step out in that case... */ +#ifndef R300_WINSYS_H +struct pipe_context* r300_create_context(struct pipe_screen* screen, +                                         struct pipe_winsys* winsys, +                                         struct r300_winsys* r300_winsys); +#endif + +#endif /* R300_CONTEXT_H */ diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h new file mode 100644 index 0000000000..385b61a096 --- /dev/null +++ b/src/gallium/drivers/r300/r300_cs.h @@ -0,0 +1,129 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_CS_H
+#define R300_CS_H
+
+#include "r300_reg.h"
+#include "r300_winsys.h"
+
+/* Pack a 32-bit float into a dword. */
+static uint32_t pack_float_32(float f)
+{
+    union {
+        float f;
+        uint32_t u;
+    } u;
+
+    u.f = f;
+    return u.u;
+}
+
+/* Yes, I know macros are ugly. However, they are much prettier than the code
+ * that they neatly hide away, and don't have the cost of function setup, so
+ * we're going to use them. */
+
+#define MAX_CS_SIZE 64 * 1024 / 4
+
+/* XXX stolen from radeon_drm.h */
+#define RADEON_GEM_DOMAIN_CPU  0x1
+#define RADEON_GEM_DOMAIN_GTT  0x2
+#define RADEON_GEM_DOMAIN_VRAM 0x4
+
+/* XXX stolen from radeon_reg.h */
+#define RADEON_CP_PACKET0 0x0
+
+#define CP_PACKET0(register, count) \
+    (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
+
+#define CP_PACKET3(op, count) \
+    (RADEON_CP_PACKET3 | (op) | ((count) << 16))
+
+#define CS_LOCALS(context) \
+    struct r300_winsys* cs_winsys = context->winsys; \
+    struct radeon_cs* cs = cs_winsys->cs; \
+    int cs_count = 0;
+
+#define CHECK_CS(size) \
+    cs_winsys->check_cs(cs, (size))
+
+#define BEGIN_CS(size) do { \
+    CHECK_CS(size); \
+    debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+        size, __FUNCTION__, __FILE__, __LINE__); \
+    cs_winsys->begin_cs(cs, (size), __FILE__, __FUNCTION__, __LINE__); \
+    cs_count = size; \
+} while (0)
+
+#define OUT_CS(value) do { \
+    cs_winsys->write_cs_dword(cs, (value)); \
+    cs_count--; \
+} while (0)
+
+#define OUT_CS_32F(value) do { \
+    cs_winsys->write_cs_dword(cs, pack_float_32(value)); \
+    cs_count--; \
+} while (0)
+
+#define OUT_CS_REG(register, value) do { \
+    debug_printf("r300: writing 0x%08X to register 0x%04X\n", \
+        value, register); \
+    assert(register); \
+    OUT_CS(CP_PACKET0(register, 0)); \
+    OUT_CS(value); \
+} while (0)
+
+/* Note: This expects count to be the number of registers,
+ * not the actual packet0 count!
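+ * As an illustrative example (based on the macro below and its use in
+ * r300_emit.c): OUT_CS_REG_SEQ(R300_ZB_CNTL, 3) emits
+ * CP_PACKET0(R300_ZB_CNTL, 2) and is then followed by three OUT_CS dwords.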
*/ +#define OUT_CS_REG_SEQ(register, count) do { \ +    debug_printf("r300: writing register sequence of %d to 0x%04X\n", \ +        count, register); \ +    assert(register); \ +    OUT_CS(CP_PACKET0(register, ((count) - 1))); \ +} while (0) + +#define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \ +    debug_printf("r300: writing relocation for buffer %p, offset %d\n", \ +        bo, offset); \ +    assert(bo); \ +    OUT_CS(offset); \ +    cs_winsys->write_cs_reloc(cs, bo, rd, wd, flags); \ +    cs_count -= 2; \ +} while (0) + +#define END_CS do { \ +    debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, __FILE__, \ +        __LINE__); \ +    if (cs_count != 0) \ +        debug_printf("r300: Warning: cs_count off by %d\n", cs_count); \ +    cs_winsys->end_cs(cs, __FILE__, __FUNCTION__, __LINE__); \ +} while (0) + +#define FLUSH_CS do { \ +    debug_printf("r300: FLUSH_CS in %s (%s:%d)\n", __FUNCTION__, __FILE__, \ +        __LINE__); \ +    cs_winsys->flush_cs(cs); \ +} while (0) + +#include "r300_cs_inlines.h" + +#endif /* R300_CS_H */ diff --git a/src/gallium/drivers/r300/r300_cs_inlines.h b/src/gallium/drivers/r300/r300_cs_inlines.h new file mode 100644 index 0000000000..71e6623699 --- /dev/null +++ b/src/gallium/drivers/r300/r300_cs_inlines.h @@ -0,0 +1,37 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* r300_cs_inlines: This is just a handful of useful inlines for sending + * (very) common instructions to the CS buffer. Should only be included from + * r300_cs.h, probably. 
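+ * R300_PACIFY below, for instance, writes R300_SC_SCREENDOOR to 0, emits a
+ * RADEON_WAIT_UNTIL, and then restores the screendoor; it appears to serve
+ * as a coarse wait-for-idle between state emissions.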
*/ + +#ifdef R300_CS_H + +#define R300_PACIFY do { \ +    OUT_CS_REG(R300_SC_SCREENDOOR, 0x0); \ +    OUT_CS_REG(RADEON_WAIT_UNTIL, (1 << 15) | (1 << 17) | \ +        (1 << 18) | (1 << 31)); \ +    OUT_CS_REG(R300_SC_SCREENDOOR, 0xffffff); \ +} while (0) + + +#endif /* R300_CS_H */ diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c new file mode 100644 index 0000000000..585a9e729d --- /dev/null +++ b/src/gallium/drivers/r300/r300_emit.c @@ -0,0 +1,155 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* r300_emit: Functions for emitting state. */ + +#include "r300_emit.h" + +void r300_emit_blend_state(struct r300_context* r300, +                           struct r300_blend_state* blend) +{ +    CS_LOCALS(r300); +    BEGIN_CS(7); +    OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 2); +    OUT_CS(blend->blend_control); +    OUT_CS(blend->alpha_blend_control); +    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop); +    OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither); +    END_CS; +} + +void r300_emit_blend_color_state(struct r300_context* r300, +                                 struct r300_blend_color_state* bc) +{ +    struct r300_screen* r300screen = +        (struct r300_screen*)r300->context.screen; +    CS_LOCALS(r300); +    if (r300screen->caps->is_r500) { +        BEGIN_CS(3); +        OUT_CS_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2); +        OUT_CS(bc->blend_color_red_alpha); +        OUT_CS(bc->blend_color_green_blue); +        END_CS; +    } else { +        BEGIN_CS(2); +        OUT_CS_REG(R300_RB3D_BLEND_COLOR, bc->blend_color); +        END_CS; +    } +} + +void r300_emit_dsa_state(struct r300_context* r300, +                           struct r300_dsa_state* dsa) +{ +    struct r300_screen* r300screen = +        (struct r300_screen*)r300->context.screen; +    CS_LOCALS(r300); +    BEGIN_CS(r300screen->caps->is_r500 ? 
8 : 8); +    OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function); +    /* XXX figure out the r300 counterpart for this */ +    if (r300screen->caps->is_r500) { +        /* OUT_CS_REG(R500_FG_ALPHA_VALUE, dsa->alpha_reference); */ +    } +    OUT_CS_REG_SEQ(R300_ZB_CNTL, 3); +    OUT_CS(dsa->z_buffer_control); +    OUT_CS(dsa->z_stencil_control); +    OUT_CS(dsa->stencil_ref_mask); +    OUT_CS_REG(R300_ZB_ZTOP, dsa->z_buffer_top); +    if (r300screen->caps->is_r500) { +        /* OUT_CS_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf); */ +    } +    END_CS; +} + +/* XXX add pitch, stride, z/stencil buf */ +void r300_emit_fb_state(struct r300_context* r300, +                        struct pipe_framebuffer_state* fb) +{ +    CS_LOCALS(r300); +    struct r300_texture* tex; +    int i; + +    BEGIN_CS((3 * fb->nr_cbufs) + 6); +    for (i = 0; i < fb->nr_cbufs; i++) { +        tex = (struct r300_texture*)fb->cbufs[i]->texture; +        OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1); +        OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); +    } +    R300_PACIFY; +    END_CS; +} + +void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs) +{ +    struct r300_screen* r300screen = +        (struct r300_screen*)r300->context.screen; +    CS_LOCALS(r300); +    BEGIN_CS(14); +    OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status); +    OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 6); +    OUT_CS(rs->depth_scale_front); +    OUT_CS(rs->depth_offset_front); +    OUT_CS(rs->depth_scale_back); +    OUT_CS(rs->depth_offset_back); +    OUT_CS(rs->polygon_offset_enable); +    OUT_CS(rs->cull_mode); +    OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config); +    OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, rs->line_stipple_value); +    END_CS; +} + +static void r300_emit_dirty_state(struct r300_context* r300) +{ +    struct r300_screen* r300screen = +        (struct r300_screen*)r300->context.screen; +    CS_LOCALS(r300); + +    if (!(r300->dirty_state) && !(r300->dirty_hw)) { +        return; +    } + +    /* XXX check size */ + +    if (r300->dirty_state & R300_NEW_BLEND) { +        r300_emit_blend_state(r300, r300->blend_state); +    } + +    if (r300->dirty_state & R300_NEW_BLEND_COLOR) { +        r300_emit_blend_color_state(r300, r300->blend_color_state); +    } + +    if (r300->dirty_state & R300_NEW_DSA) { +        r300_emit_dsa_state(r300, r300->dsa_state); +    } + +    if (r300->dirty_state & R300_NEW_RASTERIZER) { +        r300_emit_rs_state(r300, r300->rs_state); +    } + +    if (r300->dirty_state & R300_NEW_SCISSOR) { +        struct r300_scissor_state* scissor = r300->scissor_state; +        /* XXX next two are contiguous regs */ +        OUT_CS_REG(R300_SC_SCISSORS_TL, scissor->scissor_top_left); +        OUT_CS_REG(R300_SC_SCISSORS_BR, scissor->scissor_bottom_right); +    } + +    r300->dirty_state = 0; +} diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h new file mode 100644 index 0000000000..b6e69386f9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_emit.h @@ -0,0 +1,36 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to 
permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_context.h" +#include "r300_cs.h" +#include "r300_screen.h" + +void r300_emit_blend_state(struct r300_context* r300, +                           struct r300_blend_state* blend); + +void r300_emit_blend_color_state(struct r300_context* r300, +                                 struct r300_blend_color_state* bc); + +void r300_emit_dsa_state(struct r300_context* r300, +                         struct r300_dsa_state* dsa); + +void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs); diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c new file mode 100644 index 0000000000..3766f0a0a7 --- /dev/null +++ b/src/gallium/drivers/r300/r300_flush.c @@ -0,0 +1,42 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_flush.h" + +static void r300_flush(struct pipe_context* pipe, +                       unsigned flags, +                       struct pipe_fence_handle** fence) +{ +    struct r300_context* r300 = r300_context(pipe); +    CS_LOCALS(r300); + +    if (r300->dirty_hw) { +        FLUSH_CS; +        r300->dirty_state = R300_NEW_KITCHEN_SINK; +        r300->dirty_hw = 0; +    } +} + +void r300_init_flush_functions(struct r300_context* r300) +{ +    r300->context.flush = r300_flush; +} diff --git a/src/gallium/drivers/r300/r300_flush.h b/src/gallium/drivers/r300/r300_flush.h new file mode 100644 index 0000000000..a1b224b39c --- /dev/null +++ b/src/gallium/drivers/r300/r300_flush.h @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_FLUSH_H +#define R300_FLUSH_H + +#include "pipe/p_context.h" + +#include "r300_context.h" +#include "r300_cs.h" + +void r300_init_flush_functions(struct r300_context* r300); + +#endif /* R300_FLUSH_H */ diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h new file mode 100644 index 0000000000..dbd0cc28e2 --- /dev/null +++ b/src/gallium/drivers/r300/r300_reg.h @@ -0,0 +1,3290 @@ +/************************************************************************** + +Copyright (C) 2004-2005 Nicolai Haehnle et al. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +on the rights to use, copy, modify, merge, publish, distribute, sub +license, and/or sell copies of the Software, and to permit persons to whom +the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next +paragraph) shall be included in all copies or substantial portions of the +Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + +/* *INDENT-OFF* */ + +#ifndef _R300_REG_H +#define _R300_REG_H + +#define R300_MC_INIT_MISC_LAT_TIMER	0x180 +#	define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT	0 +#	define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT	4 +#	define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT	8 +#	define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT	12 +#	define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT	16 +#	define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT	20 +#	define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT	24 +#	define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT	28 + + +#define R300_MC_INIT_GFX_LAT_TIMER	0x154 +#	define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT	0 +#	define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT	4 +#	define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT	8 +#	define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT	12 +#	define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT	16 +#	define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT	20 +#	define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT	24 +#	define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT	28 + +/* + * This file contains registers and constants for the R300. They have been + * found mostly by examining command buffers captured using glxtest, as well + * as by extrapolating some known registers and constants from the R200. + * I am fairly certain that they are correct unless stated otherwise + * in comments. + */ + +#define R300_SE_VPORT_XSCALE                0x1D98 +#define R300_SE_VPORT_XOFFSET               0x1D9C +#define R300_SE_VPORT_YSCALE                0x1DA0 +#define R300_SE_VPORT_YOFFSET               0x1DA4 +#define R300_SE_VPORT_ZSCALE                0x1DA8 +#define R300_SE_VPORT_ZOFFSET               0x1DAC + + +/* + * Vertex Array Processing (VAP) Control + */ +#define R300_VAP_CNTL	0x2080 +#       define R300_PVS_NUM_SLOTS_SHIFT                 0 +#       define R300_PVS_NUM_CNTLRS_SHIFT                4 +#       define R300_PVS_NUM_FPUS_SHIFT                  8 +#       define R300_VF_MAX_VTX_NUM_SHIFT                18 +#       define R300_GL_CLIP_SPACE_DEF                   (0 << 22) +#       define R300_DX_CLIP_SPACE_DEF                   (1 << 22) +#       define R500_TCL_STATE_OPTIMIZATION              (1 << 23) + +/* This register is written directly and also starts data section + * in many 3d CP_PACKET3's + */ +#define R300_VAP_VF_CNTL	0x2084 +#	define	R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT              0 +#	define  R300_VAP_VF_CNTL__PRIM_NONE                     (0<<0) +#	define  R300_VAP_VF_CNTL__PRIM_POINTS                   (1<<0) +#	define  R300_VAP_VF_CNTL__PRIM_LINES                    (2<<0) +#	define  R300_VAP_VF_CNTL__PRIM_LINE_STRIP               (3<<0) +#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLES                (4<<0) +#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN             (5<<0) +#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP           (6<<0) +#	define  R300_VAP_VF_CNTL__PRIM_LINE_LOOP                (12<<0) +#	define  R300_VAP_VF_CNTL__PRIM_QUADS                    (13<<0) +#	define  R300_VAP_VF_CNTL__PRIM_QUAD_STRIP               (14<<0) +#	define  R300_VAP_VF_CNTL__PRIM_POLYGON                  (15<<0) + +#	define	R300_VAP_VF_CNTL__PRIM_WALK__SHIFT              4 +	/* State based - direct writes to registers trigger vertex +           generation */ +#	define	
R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED         (0<<4) +#	define	R300_VAP_VF_CNTL__PRIM_WALK_INDICES             (1<<4) +#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST         (2<<4) +#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED     (3<<4) + +	/* I don't think I saw these three used.. */ +#	define	R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT            6 +#	define	R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT     9 +#	define	R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT        10 + +	/* index size - when not set the indices are assumed to be 16 bit */ +#	define	R300_VAP_VF_CNTL__INDEX_SIZE_32bit              (1<<11) +	/* number of vertices */ +#	define	R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT           16 + +#define R500_VAP_INDEX_OFFSET		    0x208c + +#define R300_VAP_OUTPUT_VTX_FMT_0           0x2090 +#       define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT     (1<<0) +#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT (1<<1) +#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2) +#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3) +#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4) +#       define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16) + +#define R300_VAP_OUTPUT_VTX_FMT_1           0x2094 +	/* each of the following is 3 bits wide, specifies number +	   of components */ +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18 +#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21 +#	define R300_VAP_OUTPUT_VTX_FMT_1__NOT_PRESENT  0 +#	define R300_VAP_OUTPUT_VTX_FMT_1__1_COMPONENT  1 +#	define R300_VAP_OUTPUT_VTX_FMT_1__2_COMPONENTS 2 +#	define R300_VAP_OUTPUT_VTX_FMT_1__3_COMPONENTS 3 +#	define R300_VAP_OUTPUT_VTX_FMT_1__4_COMPONENTS 4 + +#define R300_SE_VTE_CNTL                  0x20b0 +#	define     R300_VPORT_X_SCALE_ENA                (1 << 0) +#	define     R300_VPORT_X_OFFSET_ENA               (1 << 1) +#	define     R300_VPORT_Y_SCALE_ENA                (1 << 2) +#	define     R300_VPORT_Y_OFFSET_ENA               (1 << 3) +#	define     R300_VPORT_Z_SCALE_ENA                (1 << 4) +#	define     R300_VPORT_Z_OFFSET_ENA               (1 << 5) +#	define     R300_VTX_XY_FMT                       (1 << 8) +#	define     R300_VTX_Z_FMT                        (1 << 9) +#	define     R300_VTX_W0_FMT                       (1 << 10) +#	define     R300_SERIAL_PROC_ENA                  (1 << 11) + +#define R300_VAP_VTX_SIZE               0x20b4 + +/* BEGIN: Vertex data assembly - lots of uncertainties */ + +/* gap */ + +/* Maximum Vertex Indx Clamp */ +#define R300_VAP_VF_MAX_VTX_INDX         0x2134 +/* Minimum Vertex Indx Clamp */ +#define R300_VAP_VF_MIN_VTX_INDX         0x2138 + +/** Vertex assembler/processor control status */ +#define R300_VAP_CNTL_STATUS              0x2140 +/* No swap at all (default) */ +#	define R300_VC_NO_SWAP                  (0 << 0) +/* 16-bit swap: 0xAABBCCDD becomes 0xBBAADDCC */ +#	define R300_VC_16BIT_SWAP               (1 << 0) +/* 32-bit swap: 0xAABBCCDD becomes 0xDDCCBBAA */ +#	define R300_VC_32BIT_SWAP               (2 << 0) +/* Half-dword swap: 0xAABBCCDD becomes 0xCCDDAABB */ +#	define 
R300_VC_HALF_DWORD_SWAP          (3 << 0) +/* The TCL engine will not be used (as it is logically or even physically removed) */ +#	define R300_VAP_TCL_BYPASS		(1 << 8) +/* Read only flag if TCL engine is busy. */ +#	define R300_VAP_PVS_BUSY                (1 << 11) +/* TODO: gap for MAX_MPS */ +/* Read only flag if the vertex store is busy. */ +#	define R300_VAP_VS_BUSY                 (1 << 24) +/* Read only flag if the reciprocal engine is busy. */ +#	define R300_VAP_RCP_BUSY                (1 << 25) +/* Read only flag if the viewport transform engine is busy. */ +#	define R300_VAP_VTE_BUSY                (1 << 26) +/* Read only flag if the memory interface unit is busy. */ +#	define R300_VAP_MUI_BUSY                (1 << 27) +/* Read only flag if the vertex cache is busy. */ +#	define R300_VAP_VC_BUSY                 (1 << 28) +/* Read only flag if the vertex fetcher is busy. */ +#	define R300_VAP_VF_BUSY                 (1 << 29) +/* Read only flag if the register pipeline is busy. */ +#	define R300_VAP_REGPIPE_BUSY            (1 << 30) +/* Read only flag if the VAP engine is busy. */ +#	define R300_VAP_VAP_BUSY                (1 << 31) + +/* gap */ + +/* Where do we get our vertex data? + * + * Vertex data either comes either from immediate mode registers or from + * vertex arrays. + * There appears to be no mixed mode (though we can force the pitch of + * vertex arrays to 0, effectively reusing the same element over and over + * again). + * + * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure + * if these registers influence vertex array processing. + * + * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3. + * + * In both cases, vertex attributes are then passed through INPUT_ROUTE. + * + * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data + * into the vertex processor's input registers. + * The first word routes the first input, the second word the second, etc. + * The corresponding input is routed into the register with the given index. + * The list is ended by a word with INPUT_ROUTE_END set. + * + * Always set COMPONENTS_4 in immediate mode. 
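+ *
+ * Hypothetical sketch (ours, not taken from any driver): a single routing
+ * word for one 4-component float attribute sent to vector location 0 could
+ * be assembled from the PROG_STREAM_CNTL defines that follow, with
+ * R300_LAST_VEC marking the final routing word:
+ *
+ *   uint32_t route0 = (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) |
+ *                     (0 << R300_DST_VEC_LOC_SHIFT) |
+ *                     R300_LAST_VEC;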
+ */ + +#define R300_VAP_PROG_STREAM_CNTL_0                     0x2150 +#       define R300_DATA_TYPE_0_SHIFT                   0 +#       define R300_DATA_TYPE_FLOAT_1                   0 +#       define R300_DATA_TYPE_FLOAT_2                   1 +#       define R300_DATA_TYPE_FLOAT_3                   2 +#       define R300_DATA_TYPE_FLOAT_4                   3 +#       define R300_DATA_TYPE_BYTE                      4 +#       define R300_DATA_TYPE_D3DCOLOR                  5 +#       define R300_DATA_TYPE_SHORT_2                   6 +#       define R300_DATA_TYPE_SHORT_4                   7 +#       define R300_DATA_TYPE_VECTOR_3_TTT              8 +#       define R300_DATA_TYPE_VECTOR_3_EET              9 +#       define R300_SKIP_DWORDS_SHIFT                   4 +#       define R300_DST_VEC_LOC_SHIFT                   8 +#       define R300_LAST_VEC                            (1 << 13) +#       define R300_SIGNED                              (1 << 14) +#       define R300_NORMALIZE                           (1 << 15) +#       define R300_DATA_TYPE_1_SHIFT                   16 +#define R300_VAP_PROG_STREAM_CNTL_1                     0x2154 +#define R300_VAP_PROG_STREAM_CNTL_2                     0x2158 +#define R300_VAP_PROG_STREAM_CNTL_3                     0x215C +#define R300_VAP_PROG_STREAM_CNTL_4                     0x2160 +#define R300_VAP_PROG_STREAM_CNTL_5                     0x2164 +#define R300_VAP_PROG_STREAM_CNTL_6                     0x2168 +#define R300_VAP_PROG_STREAM_CNTL_7                     0x216C +/* gap */ + +/* Notes: + *  - always set up to produce at least two attributes: + *    if vertex program uses only position, fglrx will set normal, too + *  - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal. + */ +#define R300_VAP_VTX_STATE_CNTL               0x2180 +#       define R300_COLOR_0_ASSEMBLY_SHIFT    0 +#       define R300_SEL_COLOR                 0 +#       define R300_SEL_USER_COLOR_0          1 +#       define R300_SEL_USER_COLOR_1          2 +#       define R300_COLOR_1_ASSEMBLY_SHIFT    2 +#       define R300_COLOR_2_ASSEMBLY_SHIFT    4 +#       define R300_COLOR_3_ASSEMBLY_SHIFT    6 +#       define R300_COLOR_4_ASSEMBLY_SHIFT    8 +#       define R300_COLOR_5_ASSEMBLY_SHIFT    10 +#       define R300_COLOR_6_ASSEMBLY_SHIFT    12 +#       define R300_COLOR_7_ASSEMBLY_SHIFT    14 +#       define R300_UPDATE_USER_COLOR_0_ENA   (1 << 16) + +/* + * Each bit in this field applies to the corresponding vector in the VSM + * memory (i.e. Bit 0 applies to VECTOR_0 (POSITION), etc.). If the bit + * is set, then the corresponding 4-Dword Vector is output into the Vertex Stream. 
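+ *
+ * Hedged example (not from the original code): a vertex stream carrying
+ * position, one color and one texture coordinate set would OR together the
+ * corresponding bits defined below:
+ *
+ *   uint32_t vtx_assm = R300_INPUT_CNTL_POS |
+ *                       R300_INPUT_CNTL_COLOR |
+ *                       R300_INPUT_CNTL_TC0;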
+ */ +#define R300_VAP_VSM_VTX_ASSM               0x2184 +#       define R300_INPUT_CNTL_POS               0x00000001 +#       define R300_INPUT_CNTL_NORMAL            0x00000002 +#       define R300_INPUT_CNTL_COLOR             0x00000004 +#       define R300_INPUT_CNTL_TC0               0x00000400 +#       define R300_INPUT_CNTL_TC1               0x00000800 +#       define R300_INPUT_CNTL_TC2               0x00001000 /* GUESS */ +#       define R300_INPUT_CNTL_TC3               0x00002000 /* GUESS */ +#       define R300_INPUT_CNTL_TC4               0x00004000 /* GUESS */ +#       define R300_INPUT_CNTL_TC5               0x00008000 /* GUESS */ +#       define R300_INPUT_CNTL_TC6               0x00010000 /* GUESS */ +#       define R300_INPUT_CNTL_TC7               0x00020000 /* GUESS */ + +/* Programmable Stream Control Signed Normalize Control */ +#define R300_VAP_PSC_SGN_NORM_CNTL         0x21dc +#	define SGN_NORM_ZERO                 0 +#	define SGN_NORM_ZERO_CLAMP_MINUS_ONE 1 +#	define SGN_NORM_NO_ZERO              2 + +/* gap */ + +/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0 + * are set to a swizzling bit pattern, other words are 0. + * + * In immediate mode, the pattern is always set to xyzw. In vertex array + * mode, the swizzling pattern is e.g. used to set zw components in texture + * coordinates with only tweo components. + */ +#define R300_VAP_PROG_STREAM_CNTL_EXT_0                 0x21e0 +#       define R300_SWIZZLE0_SHIFT                      0 +#       define R300_SWIZZLE_SELECT_X_SHIFT              0 +#       define R300_SWIZZLE_SELECT_Y_SHIFT              3 +#       define R300_SWIZZLE_SELECT_Z_SHIFT              6 +#       define R300_SWIZZLE_SELECT_W_SHIFT              9 + +#       define R300_SWIZZLE_SELECT_X                    0 +#       define R300_SWIZZLE_SELECT_Y                    1 +#       define R300_SWIZZLE_SELECT_Z                    2 +#       define R300_SWIZZLE_SELECT_W                    3 +#       define R300_SWIZZLE_SELECT_FP_ZERO              4 +#       define R300_SWIZZLE_SELECT_FP_ONE               5 +/* alternate forms for r300_emit.c */ +#       define R300_INPUT_ROUTE_SELECT_X    0 +#       define R300_INPUT_ROUTE_SELECT_Y    1 +#       define R300_INPUT_ROUTE_SELECT_Z    2 +#       define R300_INPUT_ROUTE_SELECT_W    3 +#       define R300_INPUT_ROUTE_SELECT_ZERO 4 +#       define R300_INPUT_ROUTE_SELECT_ONE  5 + +#       define R300_WRITE_ENA_SHIFT                     12 +#       define R300_WRITE_ENA_X                         1 +#       define R300_WRITE_ENA_Y                         2 +#       define R300_WRITE_ENA_Z                         4 +#       define R300_WRITE_ENA_W                         8 +#       define R300_SWIZZLE1_SHIFT                      16 +#define R300_VAP_PROG_STREAM_CNTL_EXT_1                 0x21e4 +#define R300_VAP_PROG_STREAM_CNTL_EXT_2                 0x21e8 +#define R300_VAP_PROG_STREAM_CNTL_EXT_3                 0x21ec +#define R300_VAP_PROG_STREAM_CNTL_EXT_4                 0x21f0 +#define R300_VAP_PROG_STREAM_CNTL_EXT_5                 0x21f4 +#define R300_VAP_PROG_STREAM_CNTL_EXT_6                 0x21f8 +#define R300_VAP_PROG_STREAM_CNTL_EXT_7                 0x21fc + +/* END: Vertex data assembly */ + +/* gap */ + +/* BEGIN: Upload vertex program and data */ + +/* + * The programmable vertex shader unit has a memory bank of unknown size + * that can be written to in 16 byte units by writing the address into + * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 
DWORDs). + * + * Pointers into the memory bank are always in multiples of 16 bytes. + * + * The memory bank is divided into areas with fixed meaning. + * + * Starting at address UPLOAD_PROGRAM: Vertex program instructions. + * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB), + * whereas the difference between known addresses suggests size 512. + * + * Starting at address UPLOAD_PARAMETERS: Vertex program parameters. + * Native reported limits and the VPI layout suggest size 256, whereas + * difference between known addresses suggests size 512. + * + * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the + * floating point pointsize. The exact purpose of this state is uncertain, + * as there is also the R300_RE_POINTSIZE register. + * + * Multiple vertex programs and parameter sets can be loaded at once, + * which could explain the size discrepancy. + */ +#define R300_VAP_PVS_VECTOR_INDX_REG         0x2200 +#       define R300_PVS_CODE_START           0 +#       define R300_MAX_PVS_CODE_LINES       256 +#       define R500_MAX_PVS_CODE_LINES       1024 +#       define R300_PVS_CONST_START          512 +#       define R500_PVS_CONST_START          1024 +#       define R300_MAX_PVS_CONST_VECS       256 +#       define R500_MAX_PVS_CONST_VECS       1024 +#       define R300_PVS_UCP_START            1024 +#       define R500_PVS_UCP_START            1536 +#       define R300_POINT_VPORT_SCALE_OFFSET 1030 +#       define R500_POINT_VPORT_SCALE_OFFSET 1542 +#       define R300_POINT_GEN_TEX_OFFSET     1031 +#       define R500_POINT_GEN_TEX_OFFSET     1543 + +/* + * These are obsolete defines form r300_context.h, but they might give some + * clues when investigating the addresses further... + */ +#if 0 +#define VSF_DEST_PROGRAM        0x0 +#define VSF_DEST_MATRIX0        0x200 +#define VSF_DEST_MATRIX1        0x204 +#define VSF_DEST_MATRIX2        0x208 +#define VSF_DEST_VECTOR0        0x20c +#define VSF_DEST_VECTOR1        0x20d +#define VSF_DEST_UNKNOWN1       0x400 +#define VSF_DEST_UNKNOWN2       0x406 +#endif + +/* gap */ + +#define R300_VAP_PVS_UPLOAD_DATA            0x2208 + +/* END: Upload vertex program and data */ + +/* gap */ + +/* I do not know the purpose of this register. However, I do know that + * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL + * for normal rendering. + * + * 2007-11-05: This register is the user clip plane control register, but there + * also seems to be a rendering mode control; the NORMAL/CLEAR defines. + * + * See bug #9871. 
http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view + */ +#define R300_VAP_CLIP_CNTL                       0x221C +#       define R300_VAP_UCP_ENABLE_0             (1 << 0) +#       define R300_VAP_UCP_ENABLE_1             (1 << 1) +#       define R300_VAP_UCP_ENABLE_2             (1 << 2) +#       define R300_VAP_UCP_ENABLE_3             (1 << 3) +#       define R300_VAP_UCP_ENABLE_4             (1 << 4) +#       define R300_VAP_UCP_ENABLE_5             (1 << 5) +#       define R300_PS_UCP_MODE_DIST_COP         (0 << 14) +#       define R300_PS_UCP_MODE_RADIUS_COP       (1 << 14) +#       define R300_PS_UCP_MODE_RADIUS_COP_CLIP  (2 << 14) +#       define R300_PS_UCP_MODE_CLIP_AS_TRIFAN   (3 << 14) +#       define R300_CLIP_DISABLE                 (1 << 16) +#       define R300_UCP_CULL_ONLY_ENABLE         (1 << 17) +#       define R300_BOUNDARY_EDGE_FLAG_ENABLE    (1 << 18) +#       define R500_COLOR2_IS_TEXTURE            (1 << 20) +#       define R500_COLOR3_IS_TEXTURE            (1 << 21) + +/* These seem to be per-pixel and per-vertex X and Y clipping planes. The first + * plane is per-pixel and the second plane is per-vertex. + * + * This was determined by experimentation alone but I believe it is correct. + * + * These registers are called X_QUAD0_1_FL to X_QUAD0_4_FL by glxtest. + */ +#define R300_VAP_GB_VERT_CLIP_ADJ                   0x2220 +#define R300_VAP_GB_VERT_DISC_ADJ                   0x2224 +#define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228 +#define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c + +/* gap */ + +/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between + * rendering commands and overwriting vertex program parameters. + * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and + * avoids bugs caused by still running shaders reading bad data from memory. + */ +#define R300_VAP_PVS_STATE_FLUSH_REG        0x2284 + +/* This register is used to define the number of core clocks to wait for a + * vertex to be received by the VAP input controller (while the primitive + * path is backed up) before forcing any accumulated vertices to be submitted + * to the vertex processing path. + */ +#define VAP_PVS_VTX_TIMEOUT_REG             0x2288 +#       define R300_2288_R300                    0x00750000 /* -- nh */ +#       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */ + +/* gap */ + +/* Addresses are relative to the vertex program instruction area of the + * memory bank. PROGRAM_END points to the last instruction of the active + * program + * + * The meaning of the two UNKNOWN fields is obviously not known. However, + * experiments so far have shown that both *must* point to an instruction + * inside the vertex program, otherwise the GPU locks up. + * + * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and + * R300_PVS_CNTL_1_POS_END_SHIFT points to instruction where last write to + * position takes place. + * + * Most likely this is used to ignore rest of the program in cases + * where group of verts arent visible. For some reason this "section" + * is sometimes accepted other instruction that have no relationship with + * position calculations. + */ +#define R300_VAP_PVS_CODE_CNTL_0            0x22D0 +#       define R300_PVS_FIRST_INST_SHIFT         0 +#       define R300_PVS_XYZW_VALID_INST_SHIFT    10 +#       define R300_PVS_LAST_INST_SHIFT          20 +/* Addresses are relative the the vertex program parameters area. 
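+ *
+ * Illustrative sketch only: for a program using nconsts constant vectors
+ * starting at parameter slot 0 (nconsts is a placeholder, and treating
+ * MAX_CONST_ADDR as the last addressable slot is our assumption):
+ *
+ *   uint32_t const_cntl =
+ *       (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+ *       ((nconsts - 1) << R300_PVS_MAX_CONST_ADDR_SHIFT);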
*/ +#define R300_VAP_PVS_CONST_CNTL             0x22D4 +#       define R300_PVS_CONST_BASE_OFFSET_SHIFT  0 +#       define R300_PVS_MAX_CONST_ADDR_SHIFT     16 +#define R300_VAP_PVS_CODE_CNTL_1	    0x22D8 +#       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0 +#define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC + +/* The entire range from 0x2300 to 0x2AC inclusive seems to be used for + * immediate vertices + */ +#define R300_VAP_VTX_COLOR_R                0x2464 +#define R300_VAP_VTX_COLOR_G                0x2468 +#define R300_VAP_VTX_COLOR_B                0x246C +#define R300_VAP_VTX_POS_0_X_1              0x2490 /* used for glVertex2*() */ +#define R300_VAP_VTX_POS_0_Y_1              0x2494 +#define R300_VAP_VTX_COLOR_PKD              0x249C /* RGBA */ +#define R300_VAP_VTX_POS_0_X_2              0x24A0 /* used for glVertex3*() */ +#define R300_VAP_VTX_POS_0_Y_2              0x24A4 +#define R300_VAP_VTX_POS_0_Z_2              0x24A8 +/* write 0 to indicate end of packet? */ +#define R300_VAP_VTX_END_OF_PKT             0x24AC + +/* gap */ + +/* These are values from r300_reg/r300_reg.h - they are known to be correct + * and are here so we can use one register file instead of several + * - Vladimir + */ +#define R300_GB_VAP_RASTER_VTX_FMT_0	0x4000 +#	define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT	(1<<0) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT	(1<<1) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT	(1<<2) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT	(1<<3) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT	(1<<4) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE	(0xf<<5) +#	define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT	(0x1<<16) + +#define R300_GB_VAP_RASTER_VTX_FMT_1	0x4004 +	/* each of the following is 3 bits wide, specifies number +	   of components */ +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT	0 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT	3 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT	6 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT	9 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT	12 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT	15 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT	18 +#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT	21 + +/* UNK30 seems to enables point to quad transformation on textures + * (or something closely related to that). + * This bit is rather fatal at the time being due to lackings at pixel + * shader side + * Specifies top of Raster pipe specific enable controls. + */ +#define R300_GB_ENABLE	0x4008 +#	define R300_GB_POINT_STUFF_DISABLE     (0 << 0) +#	define R300_GB_POINT_STUFF_ENABLE      (1 << 0) /* Specifies if points will have stuffed texture coordinates. */ +#	define R300_GB_LINE_STUFF_DISABLE      (0 << 1) +#	define R300_GB_LINE_STUFF_ENABLE       (1 << 1) /* Specifies if lines will have stuffed texture coordinates. */ +#	define R300_GB_TRIANGLE_STUFF_DISABLE  (0 << 2) +#	define R300_GB_TRIANGLE_STUFF_ENABLE   (1 << 2) /* Specifies if triangles will have stuffed texture coordinates. */ +#	define R300_GB_STENCIL_AUTO_DISABLE    (0 << 4) +#	define R300_GB_STENCIL_AUTO_ENABLE     (1 << 4) /* Enable stencil auto inc/dec based on triangle cw/ccw, force into dzy low bit. */ +#	define R300_GB_STENCIL_AUTO_FORCE      (2 << 4) /* Force 0 into dzy low bit. */ + +	/* each of the following is 2 bits wide */ +#define R300_GB_TEX_REPLICATE	0 /* Replicate VAP source texture coordinates (S,T,[R,Q]). 
*/ +#define R300_GB_TEX_ST		1 /* Stuff with source texture coordinates (S,T). */ +#define R300_GB_TEX_STR		2 /* Stuff with source texture coordinates (S,T,R). */ +#	define R300_GB_TEX0_SOURCE_SHIFT	16 +#	define R300_GB_TEX1_SOURCE_SHIFT	18 +#	define R300_GB_TEX2_SOURCE_SHIFT	20 +#	define R300_GB_TEX3_SOURCE_SHIFT	22 +#	define R300_GB_TEX4_SOURCE_SHIFT	24 +#	define R300_GB_TEX5_SOURCE_SHIFT	26 +#	define R300_GB_TEX6_SOURCE_SHIFT	28 +#	define R300_GB_TEX7_SOURCE_SHIFT	30 + +/* MSPOS - positions for multisample antialiasing (?) */ +#define R300_GB_MSPOS0                           0x4010 +	/* shifts - each of the fields is 4 bits */ +#	define R300_GB_MSPOS0__MS_X0_SHIFT	0 +#	define R300_GB_MSPOS0__MS_Y0_SHIFT	4 +#	define R300_GB_MSPOS0__MS_X1_SHIFT	8 +#	define R300_GB_MSPOS0__MS_Y1_SHIFT	12 +#	define R300_GB_MSPOS0__MS_X2_SHIFT	16 +#	define R300_GB_MSPOS0__MS_Y2_SHIFT	20 +#	define R300_GB_MSPOS0__MSBD0_Y		24 +#	define R300_GB_MSPOS0__MSBD0_X		28 + +#define R300_GB_MSPOS1                           0x4014 +#	define R300_GB_MSPOS1__MS_X3_SHIFT	0 +#	define R300_GB_MSPOS1__MS_Y3_SHIFT	4 +#	define R300_GB_MSPOS1__MS_X4_SHIFT	8 +#	define R300_GB_MSPOS1__MS_Y4_SHIFT	12 +#	define R300_GB_MSPOS1__MS_X5_SHIFT	16 +#	define R300_GB_MSPOS1__MS_Y5_SHIFT	20 +#	define R300_GB_MSPOS1__MSBD1		24 + +/* Specifies the graphics pipeline configuration for rasterization. */ +#define R300_GB_TILE_CONFIG                      0x4018 +#	define R300_GB_TILE_DISABLE             (0 << 0) +#	define R300_GB_TILE_ENABLE              (1 << 0) +#	define R300_GB_TILE_PIPE_COUNT_RV300	(0 << 1) /* RV350 (1 pipe, 1 ctx) */ +#	define R300_GB_TILE_PIPE_COUNT_R300	(3 << 1) /* R300 (2 pipes, 1 ctx) */ +#	define R300_GB_TILE_PIPE_COUNT_R420_3P  (6 << 1) /* R420-3P (3 pipes, 1 ctx) */ +#	define R300_GB_TILE_PIPE_COUNT_R420	(7 << 1) /* R420 (4 pipes, 1 ctx) */ +#	define R300_GB_TILE_SIZE_8		(0 << 4) +#	define R300_GB_TILE_SIZE_16		(1 << 4) +#	define R300_GB_TILE_SIZE_32		(2 << 4) +#	define R300_GB_SUPER_SIZE_1		(0 << 6) +#	define R300_GB_SUPER_SIZE_2		(1 << 6) +#	define R300_GB_SUPER_SIZE_4		(2 << 6) +#	define R300_GB_SUPER_SIZE_8		(3 << 6) +#	define R300_GB_SUPER_SIZE_16		(4 << 6) +#	define R300_GB_SUPER_SIZE_32		(5 << 6) +#	define R300_GB_SUPER_SIZE_64		(6 << 6) +#	define R300_GB_SUPER_SIZE_128		(7 << 6) +#	define R300_GB_SUPER_X_SHIFT		9	/* 3 bits wide */ +#	define R300_GB_SUPER_Y_SHIFT		12	/* 3 bits wide */ +#	define R300_GB_SUPER_TILE_A		(0 << 15) +#	define R300_GB_SUPER_TILE_B		(1 << 15) +#	define R300_GB_SUBPIXEL_1_12		(0 << 16) +#	define R300_GB_SUBPIXEL_1_16		(1 << 16) +#	define GB_TILE_CONFIG_QUADS_PER_RAS_4   (0 << 17) +#	define GB_TILE_CONFIG_QUADS_PER_RAS_8   (1 << 17) +#	define GB_TILE_CONFIG_QUADS_PER_RAS_16  (2 << 17) +#	define GB_TILE_CONFIG_QUADS_PER_RAS_32  (3 << 17) +#	define GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19) +#	define GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19) +#	define GB_TILE_CONFIG_ALT_SCAN_EN_LR    (0 << 20) +#	define GB_TILE_CONFIG_ALT_SCAN_EN_LRL   (1 << 20) +#	define GB_TILE_CONFIG_ALT_OFFSET        (0 << 21) +#	define GB_TILE_CONFIG_SUBPRECISION      (0 << 22) +#	define GB_TILE_CONFIG_ALT_TILING_DEF    (0 << 23) +#	define GB_TILE_CONFIG_ALT_TILING_3_2    (1 << 23) +#	define GB_TILE_CONFIG_Z_EXTENDED_24_1   (0 << 24) +#	define GB_TILE_CONFIG_Z_EXTENDED_S25_1  (1 << 24) + +/* Specifies the sizes of the various FIFO`s in the sc/rs/us. 
This register must be the first one written */ +#define R300_GB_FIFO_SIZE	0x4024 +	/* each of the following is 2 bits wide */ +#define R300_GB_FIFO_SIZE_32	0 +#define R300_GB_FIFO_SIZE_64	1 +#define R300_GB_FIFO_SIZE_128	2 +#define R300_GB_FIFO_SIZE_256	3 +#	define R300_SC_IFIFO_SIZE_SHIFT	0 +#	define R300_SC_TZFIFO_SIZE_SHIFT	2 +#	define R300_SC_BFIFO_SIZE_SHIFT	4 + +#	define R300_US_OFIFO_SIZE_SHIFT	12 +#	define R300_US_WFIFO_SIZE_SHIFT	14 +	/* the following use the same constants as above, but meaning is +	   is times 2 (i.e. instead of 32 words it means 64 */ +#	define R300_RS_TFIFO_SIZE_SHIFT	6 +#	define R300_RS_CFIFO_SIZE_SHIFT	8 +#	define R300_US_RAM_SIZE_SHIFT		10 +	/* watermarks, 3 bits wide */ +#	define R300_RS_HIGHWATER_COL_SHIFT	16 +#	define R300_RS_HIGHWATER_TEX_SHIFT	19 +#	define R300_OFIFO_HIGHWATER_SHIFT	22	/* two bits only */ +#	define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT	24 + +#define GB_Z_PEQ_CONFIG                          0x4028 +#	define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4    (0 << 0) +#	define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8    (1 << 0) + +/* Specifies various polygon specific selects (fog, depth, perspective). */ +#define R300_GB_SELECT                           0x401c +#	define R300_GB_FOG_SELECT_C0A		(0 << 0) +#	define R300_GB_FOG_SELECT_C1A           (1 << 0) +#	define R300_GB_FOG_SELECT_C2A           (2 << 0) +#	define R300_GB_FOG_SELECT_C3A           (3 << 0) +#	define R300_GB_FOG_SELECT_1_1_W         (4 << 0) +#	define R300_GB_FOG_SELECT_Z		(5 << 0) +#	define R300_GB_DEPTH_SELECT_Z		(0 << 3) +#	define R300_GB_DEPTH_SELECT_1_1_W	(1 << 3) +#	define R300_GB_W_SELECT_1_W		(0 << 4) +#	define R300_GB_W_SELECT_1		(1 << 4) +#	define R300_GB_FOG_STUFF_DISABLE        (0 << 5) +#	define R300_GB_FOG_STUFF_ENABLE         (1 << 5) +#	define R300_GB_FOG_STUFF_TEX_SHIFT      6 +#	define R300_GB_FOG_STUFF_TEX_MASK       0x000003c0 +#	define R300_GB_FOG_STUFF_COMP_SHIFT     10 +#	define R300_GB_FOG_STUFF_COMP_MASK      0x00000c00 + +/* Specifies the graphics pipeline configuration for antialiasing. */ +#define R300_GB_AA_CONFIG                         0x4020 +#	define GB_AA_CONFIG_AA_DISABLE           (0 << 0) +#	define GB_AA_CONFIG_AA_ENABLE            (1 << 0) +#	define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2  (0 << 1) +#	define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3  (1 << 1) +#	define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4  (2 << 1) +#	define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6  (3 << 1) + +/* Selects which of 4 pipes are active. */ +#define GB_PIPE_SELECT                           0x402c +#	define GB_PIPE_SELECT_PIPE0_ID_SHIFT  0 +#	define GB_PIPE_SELECT_PIPE1_ID_SHIFT  2 +#	define GB_PIPE_SELECT_PIPE2_ID_SHIFT  4 +#	define GB_PIPE_SELECT_PIPE3_ID_SHIFT  6 +#	define GB_PIPE_SELECT_PIPE_MASK_SHIFT 8 +#	define GB_PIPE_SELECT_MAX_PIPE        12 +#	define GB_PIPE_SELECT_BAD_PIPES       14 +#	define GB_PIPE_SELECT_CONFIG_PIPES    18 + + +/* Specifies the sizes of the various FIFO`s in the sc/rs. 
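+ *
+ * Hypothetical packing sketch (the field values are placeholders, not
+ * known-good settings), using the shift/mask pairs defined below:
+ *
+ *   uint32_t fifo_size1 =
+ *       ((ififo_hw << GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT) & GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK) |
+ *       ((bfifo_hw << GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT) & GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK) |
+ *       ((col_hw   << GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT)   & GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK)   |
+ *       ((tex_hw   << GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT)   & GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK);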
*/ +#define GB_FIFO_SIZE1                            0x4070 +/* High water mark for SC input fifo */ +#	define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0 +#	define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK  0x0000003f +/* High water mark for SC input fifo (B) */ +#	define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6 +#	define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK  0x00000fc0 +/* High water mark for RS colors' fifo */ +#	define GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT   12 +#	define GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK    0x0003f000 +/* High water mark for RS textures' fifo */ +#	define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT   18 +#	define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK    0x00fc0000 + +/* This table specifies the source location and format for up to 16 texture + * addresses (i[0]:i[15]) and four colors (c[0]:c[3]) + */ +#define R500_RS_IP_0					0x4074 +#define R500_RS_IP_1					0x4078 +#define R500_RS_IP_2					0x407C +#define R500_RS_IP_3					0x4080 +#define R500_RS_IP_4					0x4084 +#define R500_RS_IP_5					0x4088 +#define R500_RS_IP_6					0x408C +#define R500_RS_IP_7					0x4090 +#define R500_RS_IP_8					0x4094 +#define R500_RS_IP_9					0x4098 +#define R500_RS_IP_10					0x409C +#define R500_RS_IP_11					0x40A0 +#define R500_RS_IP_12					0x40A4 +#define R500_RS_IP_13					0x40A8 +#define R500_RS_IP_14					0x40AC +#define R500_RS_IP_15					0x40B0 +#define R500_RS_IP_PTR_K0                               62 +#define R500_RS_IP_PTR_K1                               63 +#define R500_RS_IP_TEX_PTR_S_SHIFT 			0 +#define R500_RS_IP_TEX_PTR_T_SHIFT 			6 +#define R500_RS_IP_TEX_PTR_R_SHIFT 			12 +#define R500_RS_IP_TEX_PTR_Q_SHIFT 			18 +#define R500_RS_IP_COL_PTR_SHIFT 			24 +#define R500_RS_IP_COL_FMT_SHIFT 			27 +#	define R500_RS_COL_PTR(x)		        (x << 24) +#       define R500_RS_COL_FMT(x)                       (x << 27) +/* gap */ +#define R500_RS_IP_OFFSET_DIS 				(0 << 31) +#define R500_RS_IP_OFFSET_EN 				(1 << 31) + +/* gap */ + +/* Zero to flush caches. */ +#define R300_TX_INVALTAGS                   0x4100 +#define R300_TX_FLUSH                       0x0 + +/* The upper enable bits are guessed, based on fglrx reported limits. 
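+ *
+ * Minimal example (ours, not from the original code): enabling the first
+ * two texture units would simply OR their bits:
+ *
+ *   uint32_t tx_enable = R300_TX_ENABLE_0 | R300_TX_ENABLE_1;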
*/ +#define R300_TX_ENABLE                      0x4104 +#       define R300_TX_ENABLE_0                  (1 << 0) +#       define R300_TX_ENABLE_1                  (1 << 1) +#       define R300_TX_ENABLE_2                  (1 << 2) +#       define R300_TX_ENABLE_3                  (1 << 3) +#       define R300_TX_ENABLE_4                  (1 << 4) +#       define R300_TX_ENABLE_5                  (1 << 5) +#       define R300_TX_ENABLE_6                  (1 << 6) +#       define R300_TX_ENABLE_7                  (1 << 7) +#       define R300_TX_ENABLE_8                  (1 << 8) +#       define R300_TX_ENABLE_9                  (1 << 9) +#       define R300_TX_ENABLE_10                 (1 << 10) +#       define R300_TX_ENABLE_11                 (1 << 11) +#       define R300_TX_ENABLE_12                 (1 << 12) +#       define R300_TX_ENABLE_13                 (1 << 13) +#       define R300_TX_ENABLE_14                 (1 << 14) +#       define R300_TX_ENABLE_15                 (1 << 15) + +#define R500_TX_FILTER_4		    0x4110 +#	define R500_TX_WEIGHT_1_SHIFT            (0) +#	define R500_TX_WEIGHT_0_SHIFT            (11) +#	define R500_TX_WEIGHT_PAIR               (1<<22) +#	define R500_TX_PHASE_SHIFT               (23) +#	define R500_TX_DIRECTION_HORIZONTAL	 (0<<27) +#	define R500_TX_DIRECTION_VERITCAL	 (1<<27) + +/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */ +#define R300_GA_POINT_S0                              0x4200 + +/* T Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */ +#define R300_GA_POINT_T0                              0x4204 + +/* S Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */ +#define R300_GA_POINT_S1                              0x4208 + +/* T Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */ +#define R300_GA_POINT_T1                              0x420c + +/* Specifies amount to shift integer position of vertex (screen space) before + * converting to float for triangle stipple. + */ +#define R300_GA_TRIANGLE_STIPPLE            0x4214 +#	define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_SHIFT 0 +#	define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_MASK  0x0000000f +#	define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT 16 +#	define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_MASK  0x000f0000 + +/* The pointsize is given in multiples of 6. The pointsize can be enormous: + * Clear() renders a single point that fills the entire framebuffer. + * 1/2 Height of point; fixed (16.0), subpixel format (1/12 or 1/16, even if in + * 8b precision). + */ +#define R300_GA_POINT_SIZE                   0x421C +#       define R300_POINTSIZE_Y_SHIFT         0 +#       define R300_POINTSIZE_Y_MASK          0x0000ffff +#       define R300_POINTSIZE_X_SHIFT         16 +#       define R300_POINTSIZE_X_MASK          0xffff0000 +#       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6) + +/* Blue fill color */ +#define R500_GA_FILL_R                                0x4220 + +/* Blue fill color */ +#define R500_GA_FILL_G                                0x4224 + +/* Blue fill color */ +#define R500_GA_FILL_B                                0x4228 + +/* Alpha fill color */ +#define R500_GA_FILL_A                                0x422c + + +/* Specifies maximum and minimum point & sprite sizes for per vertex size + * specification. The lower part (15:0) is MIN and (31:16) is max. 
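+ *
+ * Hedged sketch: assuming min_size and max_size are already expressed in
+ * the hardware's fixed-point point-size units (see R300_GA_POINT_SIZE
+ * above), the register value could be packed as:
+ *
+ *   uint32_t minmax =
+ *       ((min_size << R300_GA_POINT_MINMAX_MIN_SHIFT) & R300_GA_POINT_MINMAX_MIN_MASK) |
+ *       ((max_size << R300_GA_POINT_MINMAX_MAX_SHIFT) & R300_GA_POINT_MINMAX_MAX_MASK);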
+ */ +#define R300_GA_POINT_MINMAX                0x4230 +#       define R300_GA_POINT_MINMAX_MIN_SHIFT          0 +#       define R300_GA_POINT_MINMAX_MIN_MASK           (0xFFFF << 0) +#       define R300_GA_POINT_MINMAX_MAX_SHIFT          16 +#       define R300_GA_POINT_MINMAX_MAX_MASK           (0xFFFF << 16) + +/* 1/2 width of line, in subpixels (1/12 or 1/16 only, even in 8b + * subprecision); (16.0) fixed format. + * + * The line width is given in multiples of 6. + * In default mode lines are classified as vertical lines. + * HO: horizontal + * VE: vertical or horizontal + * HO & VE: no classification + */ +#define R300_GA_LINE_CNTL                             0x4234 +#       define R300_GA_LINE_CNTL_WIDTH_SHIFT       0 +#       define R300_GA_LINE_CNTL_WIDTH_MASK        0x0000ffff +#	define R300_GA_LINE_CNTL_END_TYPE_HOR      (0 << 16) +#	define R300_GA_LINE_CNTL_END_TYPE_VER      (1 << 16) +#	define R300_GA_LINE_CNTL_END_TYPE_SQR      (2 << 16) /* horizontal or vertical depending upon slope */ +#	define R300_GA_LINE_CNTL_END_TYPE_COMP     (3 << 16) /* Computed (perpendicular to slope) */ +#	define R500_GA_LINE_CNTL_SORT_NO           (0 << 18) +#	define R500_GA_LINE_CNTL_SORT_MINX_MINY    (1 << 18) +/** TODO: looks wrong */ +#       define R300_LINESIZE_MAX              (R300_GA_LINE_CNTL_WIDTH_MASK / 6) +/** TODO: looks wrong */ +#       define R300_LINE_CNT_HO               (1 << 16) +/** TODO: looks wrong */ +#       define R300_LINE_CNT_VE               (1 << 17) + +/* Line Stipple configuration information. */ +#define R300_GA_LINE_STIPPLE_CONFIG                   0x4238 +#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_NO     (0 << 0) +#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE   (1 << 0) +#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_PACKET (2 << 0) +#	define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_SHIFT 2 +#	define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK  0xfffffffc + +/* Used to load US instructions and constants */ +#define R500_GA_US_VECTOR_INDEX               0x4250 +#	define R500_GA_US_VECTOR_INDEX_SHIFT       0 +#	define R500_GA_US_VECTOR_INDEX_MASK        0x000000ff +#	define R500_GA_US_VECTOR_INDEX_TYPE_INSTR  (0 << 16) +#	define R500_GA_US_VECTOR_INDEX_TYPE_CONST  (1 << 16) +#	define R500_GA_US_VECTOR_INDEX_CLAMP_NO    (0 << 17) +#	define R500_GA_US_VECTOR_INDEX_CLAMP_CONST (1 << 17) + +/* Data register for loading US instructions and constants */ +#define R500_GA_US_VECTOR_DATA                0x4254 + +/* Specifies color properties and mappings of textures. 
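+ *
+ * Illustrative only: selecting Gouraud shading for the first two texture
+ * coordinate sets would OR the per-texture fields defined below:
+ *
+ *   uint32_t ps3 = R500_TEX0_SHADING_PS3_GOURAUD |
+ *                  R500_TEX1_SHADING_PS3_GOURAUD;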
*/ +#define R500_GA_COLOR_CONTROL_PS3                     0x4258 +#	define R500_TEX0_SHADING_PS3_SOLID       (0 << 0) +#	define R500_TEX0_SHADING_PS3_FLAT        (1 << 0) +#	define R500_TEX0_SHADING_PS3_GOURAUD     (2 << 0) +#	define R500_TEX1_SHADING_PS3_SOLID       (0 << 2) +#	define R500_TEX1_SHADING_PS3_FLAT        (1 << 2) +#	define R500_TEX1_SHADING_PS3_GOURAUD     (2 << 2) +#	define R500_TEX2_SHADING_PS3_SOLID       (0 << 4) +#	define R500_TEX2_SHADING_PS3_FLAT        (1 << 4) +#	define R500_TEX2_SHADING_PS3_GOURAUD     (2 << 4) +#	define R500_TEX3_SHADING_PS3_SOLID       (0 << 6) +#	define R500_TEX3_SHADING_PS3_FLAT        (1 << 6) +#	define R500_TEX3_SHADING_PS3_GOURAUD     (2 << 6) +#	define R500_TEX4_SHADING_PS3_SOLID       (0 << 8) +#	define R500_TEX4_SHADING_PS3_FLAT        (1 << 8) +#	define R500_TEX4_SHADING_PS3_GOURAUD     (2 << 8) +#	define R500_TEX5_SHADING_PS3_SOLID       (0 << 10) +#	define R500_TEX5_SHADING_PS3_FLAT        (1 << 10) +#	define R500_TEX5_SHADING_PS3_GOURAUD     (2 << 10) +#	define R500_TEX6_SHADING_PS3_SOLID       (0 << 12) +#	define R500_TEX6_SHADING_PS3_FLAT        (1 << 12) +#	define R500_TEX6_SHADING_PS3_GOURAUD     (2 << 12) +#	define R500_TEX7_SHADING_PS3_SOLID       (0 << 14) +#	define R500_TEX7_SHADING_PS3_FLAT        (1 << 14) +#	define R500_TEX7_SHADING_PS3_GOURAUD     (2 << 14) +#	define R500_TEX8_SHADING_PS3_SOLID       (0 << 16) +#	define R500_TEX8_SHADING_PS3_FLAT        (1 << 16) +#	define R500_TEX8_SHADING_PS3_GOURAUD     (2 << 16) +#	define R500_TEX9_SHADING_PS3_SOLID       (0 << 18) +#	define R500_TEX9_SHADING_PS3_FLAT        (1 << 18) +#	define R500_TEX9_SHADING_PS3_GOURAUD     (2 << 18) +#	define R500_TEX10_SHADING_PS3_SOLID      (0 << 20) +#	define R500_TEX10_SHADING_PS3_FLAT       (1 << 20) +#	define R500_TEX10_SHADING_PS3_GOURAUD    (2 << 20) +#	define R500_COLOR0_TEX_OVERRIDE_NO       (0 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_0    (1 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_1    (2 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_2    (3 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_3    (4 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_4    (5 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_5    (6 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_6    (7 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_7    (8 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_8_C2 (9 << 22) +#	define R500_COLOR0_TEX_OVERRIDE_TEX_9_C3 (10 << 22) +#	define R500_COLOR1_TEX_OVERRIDE_NO       (0 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_0    (1 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_1    (2 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_2    (3 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_3    (4 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_4    (5 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_5    (6 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_6    (7 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_7    (8 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_8_C2 (9 << 26) +#	define R500_COLOR1_TEX_OVERRIDE_TEX_9_C3 (10 << 26) + +/* Returns idle status of various G3D block, captured when GA_IDLE written or + * when hard or soft reset asserted. 
+ */ +#define R500_GA_IDLE                                  0x425c +#	define R500_GA_IDLE_PIPE3_Z_IDLE  (0 << 0) +#	define R500_GA_IDLE_PIPE2_Z_IDLE  (0 << 1) +#	define R500_GA_IDLE_PIPE3_CD_IDLE (0 << 2) +#	define R500_GA_IDLE_PIPE2_CD_IDLE (0 << 3) +#	define R500_GA_IDLE_PIPE3_FG_IDLE (0 << 4) +#	define R500_GA_IDLE_PIPE2_FG_IDLE (0 << 5) +#	define R500_GA_IDLE_PIPE3_US_IDLE (0 << 6) +#	define R500_GA_IDLE_PIPE2_US_IDLE (0 << 7) +#	define R500_GA_IDLE_PIPE3_SC_IDLE (0 << 8) +#	define R500_GA_IDLE_PIPE2_SC_IDLE (0 << 9) +#	define R500_GA_IDLE_PIPE3_RS_IDLE (0 << 10) +#	define R500_GA_IDLE_PIPE2_RS_IDLE (0 << 11) +#	define R500_GA_IDLE_PIPE1_Z_IDLE  (0 << 12) +#	define R500_GA_IDLE_PIPE0_Z_IDLE  (0 << 13) +#	define R500_GA_IDLE_PIPE1_CD_IDLE (0 << 14) +#	define R500_GA_IDLE_PIPE0_CD_IDLE (0 << 15) +#	define R500_GA_IDLE_PIPE1_FG_IDLE (0 << 16) +#	define R500_GA_IDLE_PIPE0_FG_IDLE (0 << 17) +#	define R500_GA_IDLE_PIPE1_US_IDLE (0 << 18) +#	define R500_GA_IDLE_PIPE0_US_IDLE (0 << 19) +#	define R500_GA_IDLE_PIPE1_SC_IDLE (0 << 20) +#	define R500_GA_IDLE_PIPE0_SC_IDLE (0 << 21) +#	define R500_GA_IDLE_PIPE1_RS_IDLE (0 << 22) +#	define R500_GA_IDLE_PIPE0_RS_IDLE (0 << 23) +#	define R500_GA_IDLE_SU_IDLE       (0 << 24) +#	define R500_GA_IDLE_GA_IDLE       (0 << 25) +#	define R500_GA_IDLE_GA_UNIT2_IDLE (0 << 26) + +/* Current value of stipple accumulator. */ +#define R300_GA_LINE_STIPPLE_VALUE            0x4260 + +/* S Texture Coordinate Value for Vertex 0 of Line (stuff textures -- i.e. AA) */ +#define R300_GA_LINE_S0                               0x4264 +/* S Texture Coordinate Value for Vertex 1 of Lines (V2 of parallelogram -- stuff textures -- i.e. AA) */ +#define R300_GA_LINE_S1                               0x4268 + +/* GA Input fifo high water marks */ +#define R500_GA_FIFO_CNTL                             0x4270 +#	define R500_GA_FIFO_CNTL_VERTEX_FIFO_MASK   0x00000007 +#	define R500_GA_FIFO_CNTL_VERTEX_FIFO_SHIFT  0 +#	define R500_GA_FIFO_CNTL_VERTEX_INDEX_MASK  0x00000038 +#	define R500_GA_FIFO_CNTL_VERTEX_INDEX_SHIFT 3 +#	define R500_GA_FIFO_CNTL_VERTEX_REG_MASK    0x00003fc0 +#	define R500_GA_FIFO_CNTL_VERTEX_REG_SHIFT   6 + +/* GA enhance/tweaks */ +#define R300_GA_ENHANCE                               0x4274 +#	define R300_GA_ENHANCE_DEADLOCK_CNTL_NO_EFFECT   (0 << 0) +#	define R300_GA_ENHANCE_DEADLOCK_CNTL_PREVENT_TCL (1 << 0) /* Prevents TCL interface from deadlocking on GA side. */ +#	define R300_GA_ENHANCE_FASTSYNC_CNTL_NO_EFFECT   (0 << 1) +#	define R300_GA_ENHANCE_FASTSYNC_CNTL_ENABLE      (1 << 1) /* Enables high-performance register/primitive switching. */ +#	define R500_GA_ENHANCE_REG_READWRITE_NO_EFFECT   (0 << 2) /* R520+ only */ +#	define R500_GA_ENHANCE_REG_READWRITE_ENABLE      (1 << 2) /* R520+ only, Enables GA support of simultaneous register reads and writes. */ +#	define R500_GA_ENHANCE_REG_NOSTALL_NO_EFFECT     (0 << 3) +#	define R500_GA_ENHANCE_REG_NOSTALL_ENABLE        (1 << 3) /* Enables GA support of no-stall reads for register read back. 
*/ + +#define R300_GA_COLOR_CONTROL                   0x4278 +#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_SOLID      (0 << 0) +#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT       (1 << 0) +#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD    (2 << 0) +#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_SOLID    (0 << 2) +#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT     (1 << 2) +#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD  (2 << 2) +#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_SOLID      (0 << 4) +#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT       (1 << 4) +#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD    (2 << 4) +#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_SOLID    (0 << 6) +#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_FLAT     (1 << 6) +#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD  (2 << 6) +#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_SOLID      (0 << 8) +#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT       (1 << 8) +#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD    (2 << 8) +#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_SOLID    (0 << 10) +#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT     (1 << 10) +#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD  (2 << 10) +#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_SOLID      (0 << 12) +#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT       (1 << 12) +#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD    (2 << 12) +#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_SOLID    (0 << 14) +#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_FLAT     (1 << 14) +#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD  (2 << 14) +#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST  (0 << 16) +#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND (1 << 16) +#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_THIRD  (2 << 16) +#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST   (3 << 16) + +/** TODO: might be candidate for removal */ +#	define R300_RE_SHADE_MODEL_SMOOTH     ( \ +	R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST ) +/** TODO: might be candidate for removal, the GOURAUD stuff also looks buggy to me */ +#	define R300_RE_SHADE_MODEL_FLAT     ( \ +	R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \ +	R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \ +	R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \ +	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST ) + +/* Specifies red & green components of fill color -- S312 format -- Backwards comp. */ +#define R300_GA_SOLID_RG                         0x427c +#	define GA_SOLID_RG_COLOR_GREEN_SHIFT 0 +#	define GA_SOLID_RG_COLOR_GREEN_MASK  0x0000ffff +#	define GA_SOLID_RG_COLOR_RED_SHIFT   16 +#	define GA_SOLID_RG_COLOR_RED_MASK    0xffff0000 +/* Specifies blue & alpha components of fill color -- S312 format -- Backwards comp. 
*/ +#define R300_GA_SOLID_BA                         0x4280 +#	define GA_SOLID_BA_COLOR_ALPHA_SHIFT 0 +#	define GA_SOLID_BA_COLOR_ALPHA_MASK  0x0000ffff +#	define GA_SOLID_BA_COLOR_BLUE_SHIFT  16 +#	define GA_SOLID_BA_COLOR_BLUE_MASK   0xffff0000 + +/* Polygon Mode + * Dangerous + */ +#define R300_GA_POLY_MODE                             0x4288 +#	define R300_GA_POLY_MODE_DISABLE           (0 << 0) +#	define R300_GA_POLY_MODE_DUAL              (1 << 0) /* send 2 sets of 3 polys with specified poly type */ +/* reserved */ +#	define R300_GA_POLY_MODE_FRONT_PTYPE_POINT (0 << 4) +#	define R300_GA_POLY_MODE_FRONT_PTYPE_LINE  (1 << 4) +#	define R300_GA_POLY_MODE_FRONT_PTYPE_TRI   (2 << 4) +/* reserved */ +#	define R300_GA_POLY_MODE_BACK_PTYPE_POINT  (0 << 7) +#	define R300_GA_POLY_MODE_BACK_PTYPE_LINE   (1 << 7) +#	define R300_GA_POLY_MODE_BACK_PTYPE_TRI    (2 << 7) +/* reserved */ + +/* Specifies the rouding mode for geometry & color SPFP to FP conversions. */ +#define R300_GA_ROUND_MODE                            0x428c +#	define R300_GA_ROUND_MODE_GEOMETRY_ROUND_TRUNC   (0 << 0) +#	define R300_GA_ROUND_MODE_GEOMETRY_ROUND_NEAREST (1 << 0) +#	define R300_GA_ROUND_MODE_COLOR_ROUND_TRUNC      (0 << 2) +#	define R300_GA_ROUND_MODE_COLOR_ROUND_NEAREST    (1 << 2) +#	define R300_GA_ROUND_MODE_RGB_CLAMP_RGB          (0 << 4) +#	define R300_GA_ROUND_MODE_RGB_CLAMP_FP20         (1 << 4) +#	define R300_GA_ROUND_MODE_ALPHA_CLAMP_RGB        (0 << 5) +#	define R300_GA_ROUND_MODE_ALPHA_CLAMP_FP20       (1 << 5) +#	define R500_GA_ROUND_MODE_GEOMETRY_MASK_SHIFT    6 +#	define R500_GA_ROUND_MODE_GEOMETRY_MASK_MASK     0x000003c0 + +/* Specifies x & y offsets for vertex data after conversion to FP. + * Offsets are in S15 format (subpixels -- 1/12 or 1/16, even in 8b + * subprecision). + */ +#define R300_GA_OFFSET                                0x4290 +#	define R300_GA_OFFSET_X_OFFSET_SHIFT 0 +#	define R300_GA_OFFSET_X_OFFSET_MASK  0x0000ffff +#	define R300_GA_OFFSET_Y_OFFSET_SHIFT 16 +#	define R300_GA_OFFSET_Y_OFFSET_MASK  0xffff0000 + +/* Specifies the scale to apply to fog. */ +#define R300_GA_FOG_SCALE                     0x4294 +/* Specifies the offset to apply to fog. */ +#define R300_GA_FOG_OFFSET                    0x4298 +/* Specifies number of cycles to assert reset, and also causes RB3D soft reset to assert. */ +#define R300_GA_SOFT_RESET                    0x429c + +/* Not sure why there are duplicate of factor and constant values. + * My best guess so far is that there are seperate zbiases for test and write. + * Ordering might be wrong. + * Some of the tests indicate that fgl has a fallback implementation of zbias + * via pixel shaders. + */ +#define R300_SU_TEX_WRAP                      0x42A0 +#define R300_SU_POLY_OFFSET_FRONT_SCALE       0x42A4 +#define R300_SU_POLY_OFFSET_FRONT_OFFSET      0x42A8 +#define R300_SU_POLY_OFFSET_BACK_SCALE        0x42AC +#define R300_SU_POLY_OFFSET_BACK_OFFSET       0x42B0 + +/* This register needs to be set to (1<<1) for RV350 to correctly + * perform depth test (see --vb-triangles in r300_demo) + * Don't know about other chips. - Vladimir + * This is set to 3 when GL_POLYGON_OFFSET_FILL is on. + * My guess is that there are two bits for each zbias primitive + * (FILL, LINE, POINT). + *  One to enable depth test and one for depth write. + * Yet this doesnt explain why depth writes work ... 
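+ *
+ * Sketch matching the observation above: the value 3 seen with
+ * GL_POLYGON_OFFSET_FILL corresponds to enabling the offset for both faces:
+ *
+ *   uint32_t offset_enable = R300_FRONT_ENABLE | R300_BACK_ENABLE;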
+ */ +#define R300_SU_POLY_OFFSET_ENABLE	       0x42B4 +#	define R300_FRONT_ENABLE	       (1 << 0) +#	define R300_BACK_ENABLE 	       (1 << 1) +#	define R300_PARA_ENABLE 	       (1 << 2) + +#define R300_SU_CULL_MODE                      0x42B8 +#       define R300_CULL_FRONT                   (1 << 0) +#       define R300_CULL_BACK                    (1 << 1) +#       define R300_FRONT_FACE_CCW               (0 << 2) +#       define R300_FRONT_FACE_CW                (1 << 2) + +/* SU Depth Scale value */ +#define R300_SU_DEPTH_SCALE                 0x42c0 +/* SU Depth Offset value */ +#define R300_SU_DEPTH_OFFSET                0x42c4 + + +/* BEGIN: Rasterization / Interpolators - many guesses */ + +/* + * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends + * on the vertex program, *not* the fragment program) + */ +#define R300_RS_COUNT                      0x4300 +#       define R300_IT_COUNT_SHIFT               0 +#       define R300_IT_COUNT_MASK                0x0000007f +#       define R300_IC_COUNT_SHIFT               7 +#       define R300_IC_COUNT_MASK                0x00000780 +#       define R300_W_ADDR_SHIFT                 12 +#       define R300_W_ADDR_MASK                  0x0003f000 +#       define R300_HIRES_DIS                    (0 << 18) +#       define R300_HIRES_EN                     (1 << 18) + +#define R300_RS_INST_COUNT                       0x4304 +#       define R300_RS_INST_COUNT_SHIFT          0 +#       define R300_RS_INST_COUNT_MASK           0x0000000f +#       define R300_RS_TX_OFFSET_SHIFT           5 +#	define R300_RS_TX_OFFSET_MASK            0x000000e0 + +/* gap */ + +/* Only used for texture coordinates. + * Use the source field to route texture coordinate input from the + * vertex program to the desired interpolator. Note that the source + * field is relative to the outputs the vertex program *actually* + * writes. If a vertex program only writes texcoord[1], this will + * be source index 0. + * Set INTERP_USED on all interpolators that produce data used by + * the fragment program. INTERP_USED looks like a swizzling mask, + * but I haven't seen it used that way. + * + * Note: The _UNKNOWN constants are always set in their respective + * register. I don't know if this is necessary. 
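+ *
+ * Hedged sketch (ours): routing a two-component texture coordinate into
+ * interpolator 0 might select S and T from the incoming components and pad
+ * R and Q from the K0/K1 constant selects (assumed to be the usual 0.0 and
+ * 1.0 constants):
+ *
+ *   uint32_t rs_ip0 = R300_RS_TEX_PTR(0) |
+ *                     R300_RS_SEL_S(R300_RS_SEL_C0) |
+ *                     R300_RS_SEL_T(R300_RS_SEL_C1) |
+ *                     R300_RS_SEL_R(R300_RS_SEL_K0) |
+ *                     R300_RS_SEL_Q(R300_RS_SEL_K1);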
+ */ +#define R300_RS_IP_0				        0x4310 +#define R300_RS_IP_1				        0x4314 +#define R300_RS_IP_2				        0x4318 +#define R300_RS_IP_3				        0x431C +#       define R300_RS_INTERP_SRC_SHIFT          2 /* TODO: check for removal */ +#       define R300_RS_INTERP_SRC_MASK           (7 << 2) /* TODO: check for removal */ +#	define R300_RS_TEX_PTR(x)		        (x << 0) +#	define R300_RS_COL_PTR(x)		        (x << 6) +#	define R300_RS_COL_FMT(x)		        (x << 9) +#	define R300_RS_COL_FMT_RGBA		        0 +#	define R300_RS_COL_FMT_RGB0		        1 +#	define R300_RS_COL_FMT_RGB1		        2 +#	define R300_RS_COL_FMT_000A		        4 +#	define R300_RS_COL_FMT_0000		        5 +#	define R300_RS_COL_FMT_0001		        6 +#	define R300_RS_COL_FMT_111A		        8 +#	define R300_RS_COL_FMT_1110		        9 +#	define R300_RS_COL_FMT_1111		        10 +#	define R300_RS_SEL_S(x)		                (x << 13) +#	define R300_RS_SEL_T(x)		                (x << 16) +#	define R300_RS_SEL_R(x)		                (x << 19) +#	define R300_RS_SEL_Q(x)		                (x << 22) +#	define R300_RS_SEL_C0		                0 +#	define R300_RS_SEL_C1		                1 +#	define R300_RS_SEL_C2		                2 +#	define R300_RS_SEL_C3		                3 +#	define R300_RS_SEL_K0		                4 +#	define R300_RS_SEL_K1		                5 + + +/*  */ +#define R500_RS_INST_0					0x4320 +#define R500_RS_INST_1					0x4324 +#define R500_RS_INST_2					0x4328 +#define R500_RS_INST_3					0x432c +#define R500_RS_INST_4					0x4330 +#define R500_RS_INST_5					0x4334 +#define R500_RS_INST_6					0x4338 +#define R500_RS_INST_7					0x433c +#define R500_RS_INST_8					0x4340 +#define R500_RS_INST_9					0x4344 +#define R500_RS_INST_10					0x4348 +#define R500_RS_INST_11					0x434c +#define R500_RS_INST_12					0x4350 +#define R500_RS_INST_13					0x4354 +#define R500_RS_INST_14					0x4358 +#define R500_RS_INST_15					0x435c +#define R500_RS_INST_TEX_ID_SHIFT			0 +#define R500_RS_INST_TEX_CN_WRITE			(1 << 4) +#define R500_RS_INST_TEX_ADDR_SHIFT			5 +#define R500_RS_INST_COL_ID_SHIFT			12 +#define R500_RS_INST_COL_CN_NO_WRITE			(0 << 16) +#define R500_RS_INST_COL_CN_WRITE			(1 << 16) +#define R500_RS_INST_COL_CN_WRITE_FBUFFER		(2 << 16) +#define R500_RS_INST_COL_CN_WRITE_BACKFACE		(3 << 16) +#define R500_RS_INST_COL_ADDR_SHIFT			18 +#define R500_RS_INST_TEX_ADJ				(1 << 25) +#define R500_RS_INST_W_CN				(1 << 26) + +/* These DWORDs control how vertex data is routed into fragment program + * registers, after interpolators. 
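+ *
+ * Hypothetical sketch: writing interpolated texture coordinate 0 into the
+ * fragment program register at address fp_addr (fp_addr is a placeholder):
+ *
+ *   uint32_t rs_inst0 = R300_RS_INST_TEX_ID(0) |
+ *                       R300_RS_INST_TEX_CN_WRITE |
+ *                       (fp_addr << R300_RS_INST_TEX_ADDR_SHIFT);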
+ */
+#define R300_RS_INST_0                     0x4330
+#define R300_RS_INST_1                     0x4334
+#define R300_RS_INST_2                     0x4338
+#define R300_RS_INST_3                     0x433C
+#define R300_RS_INST_4                     0x4340
+#define R300_RS_INST_5                     0x4344
+#define R300_RS_INST_6                     0x4348
+#define R300_RS_INST_7                     0x434C
+#	define R300_RS_INST_TEX_ID(x)  		((x) << 0)
+#	define R300_RS_INST_TEX_CN_WRITE 	(1 << 3)
+#	define R300_RS_INST_TEX_ADDR_SHIFT 	6
+#	define R300_RS_INST_COL_ID(x)		((x) << 11)
+#	define R300_RS_INST_COL_CN_WRITE	(1 << 14)
+#	define R300_RS_INST_COL_ADDR_SHIFT	17
+#	define R300_RS_INST_TEX_ADJ		(1 << 22)
+#	define R300_RS_COL_BIAS_UNUSED_SHIFT    23
+
+/* END: Rasterization / Interpolators - many guesses */
+
+/* Hierarchical Z Enable */
+#define R300_SC_HYPERZ                   0x43a4
+#	define R300_SC_HYPERZ_DISABLE     (0 << 0)
+#	define R300_SC_HYPERZ_ENABLE      (1 << 0)
+#	define R300_SC_HYPERZ_MIN         (0 << 1)
+#	define R300_SC_HYPERZ_MAX         (1 << 1)
+#	define R300_SC_HYPERZ_ADJ_256     (0 << 2)
+#	define R300_SC_HYPERZ_ADJ_128     (1 << 2)
+#	define R300_SC_HYPERZ_ADJ_64      (2 << 2)
+#	define R300_SC_HYPERZ_ADJ_32      (3 << 2)
+#	define R300_SC_HYPERZ_ADJ_16      (4 << 2)
+#	define R300_SC_HYPERZ_ADJ_8       (5 << 2)
+#	define R300_SC_HYPERZ_ADJ_4       (6 << 2)
+#	define R300_SC_HYPERZ_ADJ_2       (7 << 2)
+#	define R300_SC_HYPERZ_HZ_Z0MIN_NO (0 << 5)
+#	define R300_SC_HYPERZ_HZ_Z0MIN    (1 << 5)
+#	define R300_SC_HYPERZ_HZ_Z0MAX_NO (0 << 6)
+#	define R300_SC_HYPERZ_HZ_Z0MAX    (1 << 6)
+
+#define R300_SC_EDGERULE                 0x43a8
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
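+ *
+ * Worked example (editor's illustration): following the numbering above, a
+ * pixel inside cliprects 0 and 1 but outside 2 and 3 has number 3, so setting
+ * R300_CLIP_10 (1 << 3) in SC_CLIP_RULE rasterizes exactly those pixels;
+ * setting all sixteen bits (0xFFFF) accepts every pixel regardless of the
+ * cliprects. Because of the (1440, 1440) origin, a scissors rectangle
+ * covering x = 0..1023, y = 0..767 would presumably be programmed as
+ *   SC_SCISSORS_TL = (1440 << R300_SCISSORS_X_SHIFT) |
+ *                    (1440 << R300_SCISSORS_Y_SHIFT);
+ *   SC_SCISSORS_BR = ((1440 + 1023) << R300_SCISSORS_X_SHIFT) |
+ *                    ((1440 + 767) << R300_SCISSORS_Y_SHIFT);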
+ */ +#define R300_SC_CLIPRECT_TL_0               0x43B0 +#define R300_SC_CLIPRECT_BR_0               0x43B4 +#define R300_SC_CLIPRECT_TL_1               0x43B8 +#define R300_SC_CLIPRECT_BR_1               0x43BC +#define R300_SC_CLIPRECT_TL_2               0x43C0 +#define R300_SC_CLIPRECT_BR_2               0x43C4 +#define R300_SC_CLIPRECT_TL_3               0x43C8 +#define R300_SC_CLIPRECT_BR_3               0x43CC +#       define R300_CLIPRECT_OFFSET              1440 +#       define R300_CLIPRECT_MASK                0x1FFF +#       define R300_CLIPRECT_X_SHIFT             0 +#       define R300_CLIPRECT_X_MASK              (0x1FFF << 0) +#       define R300_CLIPRECT_Y_SHIFT             13 +#       define R300_CLIPRECT_Y_MASK              (0x1FFF << 13) +#define R300_SC_CLIP_RULE                   0x43D0 +#       define R300_CLIP_OUT                     (1 << 0) +#       define R300_CLIP_0                       (1 << 1) +#       define R300_CLIP_1                       (1 << 2) +#       define R300_CLIP_10                      (1 << 3) +#       define R300_CLIP_2                       (1 << 4) +#       define R300_CLIP_20                      (1 << 5) +#       define R300_CLIP_21                      (1 << 6) +#       define R300_CLIP_210                     (1 << 7) +#       define R300_CLIP_3                       (1 << 8) +#       define R300_CLIP_30                      (1 << 9) +#       define R300_CLIP_31                      (1 << 10) +#       define R300_CLIP_310                     (1 << 11) +#       define R300_CLIP_32                      (1 << 12) +#       define R300_CLIP_320                     (1 << 13) +#       define R300_CLIP_321                     (1 << 14) +#       define R300_CLIP_3210                    (1 << 15) + +/* gap */ + +#define R300_SC_SCISSORS_TL                 0x43E0 +#define R300_SC_SCISSORS_BR                 0x43E4 +#       define R300_SCISSORS_OFFSET              1440 +#       define R300_SCISSORS_X_SHIFT             0 +#       define R300_SCISSORS_X_MASK              (0x1FFF << 0) +#       define R300_SCISSORS_Y_SHIFT             13 +#       define R300_SCISSORS_Y_MASK              (0x1FFF << 13) + +/* Screen door sample mask */ +#define R300_SC_SCREENDOOR                 0x43e8 + +/* END: Scissors and cliprects */ + +/* BEGIN: Texture specification */ + +/* + * The texture specification dwords are grouped by meaning and not by texture + * unit. This means that e.g. 
the offset for texture image unit N is found in + * register TX_OFFSET_0 + (4*N) + */ +#define R300_TX_FILTER0_0                        0x4400 +#define R300_TX_FILTER0_1                        0x4404 +#define R300_TX_FILTER0_2                        0x4408 +#define R300_TX_FILTER0_3                        0x440c +#define R300_TX_FILTER0_4                        0x4410 +#define R300_TX_FILTER0_5                        0x4414 +#define R300_TX_FILTER0_6                        0x4418 +#define R300_TX_FILTER0_7                        0x441c +#define R300_TX_FILTER0_8                        0x4420 +#define R300_TX_FILTER0_9                        0x4424 +#define R300_TX_FILTER0_10                       0x4428 +#define R300_TX_FILTER0_11                       0x442c +#define R300_TX_FILTER0_12                       0x4430 +#define R300_TX_FILTER0_13                       0x4434 +#define R300_TX_FILTER0_14                       0x4438 +#define R300_TX_FILTER0_15                       0x443c +#       define R300_TX_REPEAT                    0 +#       define R300_TX_MIRRORED                  1 +#       define R300_TX_CLAMP_TO_EDGE             2 +#	define R300_TX_MIRROR_ONCE_TO_EDGE       3 +#       define R300_TX_CLAMP                     4 +#	define R300_TX_MIRROR_ONCE               5 +#       define R300_TX_CLAMP_TO_BORDER           6 +#	define R300_TX_MIRROR_ONCE_TO_BORDER     7 +#       define R300_TX_WRAP_S_SHIFT              0 +#       define R300_TX_WRAP_S_MASK               (7 << 0) +#       define R300_TX_WRAP_T_SHIFT              3 +#       define R300_TX_WRAP_T_MASK               (7 << 3) +#       define R300_TX_WRAP_R_SHIFT              6 +#       define R300_TX_WRAP_R_MASK               (7 << 6) +#	define R300_TX_MAG_FILTER_4              (0 << 9) +#       define R300_TX_MAG_FILTER_NEAREST        (1 << 9) +#       define R300_TX_MAG_FILTER_LINEAR         (2 << 9) +#       define R300_TX_MAG_FILTER_ANISO          (3 << 9) +#       define R300_TX_MAG_FILTER_MASK           (3 << 9) +#       define R300_TX_MIN_FILTER_NEAREST        (1 << 11) +#       define R300_TX_MIN_FILTER_LINEAR         (2 << 11) +#	define R300_TX_MIN_FILTER_ANISO          (3 << 11) +#	define R300_TX_MIN_FILTER_MASK           (3 << 11) +#	define R300_TX_MIN_FILTER_MIP_NONE       (0 << 13) +#	define R300_TX_MIN_FILTER_MIP_NEAREST    (1 << 13) +#	define R300_TX_MIN_FILTER_MIP_LINEAR     (2 << 13) +#	define R300_TX_MIN_FILTER_MIP_MASK       (3 << 13) +#	define R300_TX_MAX_ANISO_1_TO_1          (0 << 21) +#	define R300_TX_MAX_ANISO_2_TO_1          (1 << 21) +#	define R300_TX_MAX_ANISO_4_TO_1          (2 << 21) +#	define R300_TX_MAX_ANISO_8_TO_1          (3 << 21) +#	define R300_TX_MAX_ANISO_16_TO_1         (4 << 21) +#	define R300_TX_MAX_ANISO_MASK            (7 << 21) + +#define R300_TX_FILTER1_0                      0x4440 +#	define R300_CHROMA_KEY_MODE_DISABLE    0 +#	define R300_CHROMA_KEY_FORCE	       1 +#	define R300_CHROMA_KEY_BLEND           2 +#	define R300_MC_ROUND_NORMAL            (0<<2) +#	define R300_MC_ROUND_MPEG4             (1<<2) +#	define R300_LOD_BIAS_SHIFT             3 +#	define R300_LOD_BIAS_MASK	       0x1ff8 +#	define R300_EDGE_ANISO_EDGE_DIAG       (0<<13) +#	define R300_EDGE_ANISO_EDGE_ONLY       (1<<13) +#	define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14) +#	define R300_MC_COORD_TRUNCATE_MPEG     (1<<14) +#	define R300_TX_TRI_PERF_0_8            (0<<15) +#	define R300_TX_TRI_PERF_1_8            (1<<15) +#	define R300_TX_TRI_PERF_1_4            (2<<15) +#	define R300_TX_TRI_PERF_3_8            
(3<<15) +#	define R300_ANISO_THRESHOLD_MASK       (7<<17) + +#	define R500_MACRO_SWITCH               (1<<22) +#	define R500_BORDER_FIX                 (1<<31) + +#define R300_TX_SIZE_0                      0x4480 +#       define R300_TX_WIDTHMASK_SHIFT           0 +#       define R300_TX_WIDTHMASK_MASK            (2047 << 0) +#       define R300_TX_HEIGHTMASK_SHIFT          11 +#       define R300_TX_HEIGHTMASK_MASK           (2047 << 11) +#	define R300_TX_DEPTHMASK_SHIFT		 22 +#	define R300_TX_DEPTHMASK_MASK		 (0xf << 22) +#       define R300_TX_MAX_MIP_LEVEL_SHIFT       26 +#       define R300_TX_MAX_MIP_LEVEL_MASK        (0xf << 26) +#       define R300_TX_SIZE_PROJECTED            (1<<30) +#       define R300_TX_SIZE_TXPITCH_EN           (1<<31) +#define R300_TX_FORMAT_0                    0x44C0 +	/* The interpretation of the format word by Wladimir van der Laan */ +	/* The X, Y, Z and W refer to the layout of the components. +	   They are given meanings as R, G, B and Alpha by the swizzle +	   specification */ +#	define R300_TX_FORMAT_X8		    0x0 +#	define R500_TX_FORMAT_X1		    0x0 // bit set in format 2 +#	define R300_TX_FORMAT_X16		    0x1 +#	define R500_TX_FORMAT_X1_REV		    0x0 // bit set in format 2 +#	define R300_TX_FORMAT_Y4X4		    0x2 +#	define R300_TX_FORMAT_Y8X8		    0x3 +#	define R300_TX_FORMAT_Y16X16		    0x4 +#	define R300_TX_FORMAT_Z3Y3X2		    0x5 +#	define R300_TX_FORMAT_Z5Y6X5		    0x6 +#	define R300_TX_FORMAT_Z6Y5X5		    0x7 +#	define R300_TX_FORMAT_Z11Y11X10		    0x8 +#	define R300_TX_FORMAT_Z10Y11X11		    0x9 +#	define R300_TX_FORMAT_W4Z4Y4X4		    0xA +#	define R300_TX_FORMAT_W1Z5Y5X5		    0xB +#	define R300_TX_FORMAT_W8Z8Y8X8		    0xC +#	define R300_TX_FORMAT_W2Z10Y10X10	    0xD +#	define R300_TX_FORMAT_W16Z16Y16X16	    0xE +#	define R300_TX_FORMAT_DXT1	    	    0xF +#	define R300_TX_FORMAT_DXT3	    	    0x10 +#	define R300_TX_FORMAT_DXT5	    	    0x11 +#	define R300_TX_FORMAT_D3DMFT_CxV8U8	    0x12     /* no swizzle */ +#	define R300_TX_FORMAT_A8R8G8B8	    	    0x13     /* no swizzle */ +#	define R300_TX_FORMAT_B8G8_B8G8	    	    0x14     /* no swizzle */ +#	define R300_TX_FORMAT_G8R8_G8B8	    	    0x15     /* no swizzle */ + +	/* These two values are wrong, but they're the only values that +	 * produce any even vaguely correct results.  Can r300 only do 16-bit +	 * depth textures? +	 */ +#	define R300_TX_FORMAT_X24_Y8	    	    0x1e +#	define R300_TX_FORMAT_X32	    	    0x1e + +	/* 0x16 - some 16 bit green format.. ?? 
*/ +#	define R300_TX_FORMAT_3D		   (1 << 25) +#	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25) + +	/* gap */ +	/* Floating point formats */ +	/* Note - hardware supports both 16 and 32 bit floating point */ +#	define R300_TX_FORMAT_FL_I16	    	    0x18 +#	define R300_TX_FORMAT_FL_I16A16	    	    0x19 +#	define R300_TX_FORMAT_FL_R16G16B16A16	    0x1A +#	define R300_TX_FORMAT_FL_I32	    	    0x1B +#	define R300_TX_FORMAT_FL_I32A32	    	    0x1C +#	define R300_TX_FORMAT_FL_R32G32B32A32	    0x1D +	/* alpha modes, convenience mostly */ +	/* if you have alpha, pick constant appropriate to the +	   number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc */ +# 	define R300_TX_FORMAT_ALPHA_1CH		    0x000 +# 	define R300_TX_FORMAT_ALPHA_2CH		    0x200 +# 	define R300_TX_FORMAT_ALPHA_4CH		    0x600 +# 	define R300_TX_FORMAT_ALPHA_NONE	    0xA00 +	/* Swizzling */ +	/* constants */ +#	define R300_TX_FORMAT_X		0 +#	define R300_TX_FORMAT_Y		1 +#	define R300_TX_FORMAT_Z		2 +#	define R300_TX_FORMAT_W		3 +#	define R300_TX_FORMAT_ZERO	4 +#	define R300_TX_FORMAT_ONE	5 +	/* 2.0*Z, everything above 1.0 is set to 0.0 */ +#	define R300_TX_FORMAT_CUT_Z	6 +	/* 2.0*W, everything above 1.0 is set to 0.0 */ +#	define R300_TX_FORMAT_CUT_W	7 + +#	define R300_TX_FORMAT_B_SHIFT	18 +#	define R300_TX_FORMAT_G_SHIFT	15 +#	define R300_TX_FORMAT_R_SHIFT	12 +#	define R300_TX_FORMAT_A_SHIFT	9 +	/* Convenience macro to take care of layout and swizzling */ +#	define R300_EASY_TX_FORMAT(B, G, R, A, FMT)	(		\ +		((R300_TX_FORMAT_##B)<<R300_TX_FORMAT_B_SHIFT)		\ +		| ((R300_TX_FORMAT_##G)<<R300_TX_FORMAT_G_SHIFT)	\ +		| ((R300_TX_FORMAT_##R)<<R300_TX_FORMAT_R_SHIFT)	\ +		| ((R300_TX_FORMAT_##A)<<R300_TX_FORMAT_A_SHIFT)	\ +		| (R300_TX_FORMAT_##FMT)				\ +		) +	/* These can be ORed with result of R300_EASY_TX_FORMAT() +	   We don't really know what they do. Take values from a +           constant color ? 
*/ +#	define R300_TX_FORMAT_CONST_X		(1<<5) +#	define R300_TX_FORMAT_CONST_Y		(2<<5) +#	define R300_TX_FORMAT_CONST_Z		(4<<5) +#	define R300_TX_FORMAT_CONST_W		(8<<5) + +#	define R300_TX_FORMAT_YUV_MODE		0x00800000 + +#define R300_TX_FORMAT2_0		    0x4500 /* obvious missing in gap */ +#       define R300_TX_PITCHMASK_SHIFT           0 +#       define R300_TX_PITCHMASK_MASK            (2047 << 0) +#	define R500_TXFORMAT_MSB		 (1 << 14) +#	define R500_TXWIDTH_BIT11	         (1 << 15) +#	define R500_TXHEIGHT_BIT11	         (1 << 16) +#	define R500_POW2FIX2FLT			 (1 << 17) +#	define R500_SEL_FILTER4_TC0		 (0 << 18) +#	define R500_SEL_FILTER4_TC1		 (1 << 18) +#	define R500_SEL_FILTER4_TC2		 (2 << 18) +#	define R500_SEL_FILTER4_TC3		 (3 << 18) + +#define R300_TX_OFFSET_0                    0x4540 +#define R300_TX_OFFSET_1                    0x4544 +#define R300_TX_OFFSET_2                    0x4548 +#define R300_TX_OFFSET_3                    0x454C +#define R300_TX_OFFSET_4                    0x4550 +#define R300_TX_OFFSET_5                    0x4554 +#define R300_TX_OFFSET_6                    0x4558 +#define R300_TX_OFFSET_7                    0x455C +	/* BEGIN: Guess from R200 */ +#       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0) +#       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0) +#       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0) +#       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0) +#       define R300_TXO_MACRO_TILE               (1 << 2) +#       define R300_TXO_MICRO_TILE_LINEAR        (0 << 3) +#       define R300_TXO_MICRO_TILE               (1 << 3) +#       define R300_TXO_MICRO_TILE_SQUARE        (2 << 3) +#       define R300_TXO_OFFSET_MASK              0xffffffe0 +#       define R300_TXO_OFFSET_SHIFT             5 +	/* END: Guess from R200 */ + +/* 32 bit chroma key */ +#define R300_TX_CHROMA_KEY_0                      0x4580 +#define R300_TX_CHROMA_KEY_1                      0x4584 +#define R300_TX_CHROMA_KEY_2                      0x4588 +#define R300_TX_CHROMA_KEY_3                      0x458c +#define R300_TX_CHROMA_KEY_4                      0x4590 +#define R300_TX_CHROMA_KEY_5                      0x4594 +#define R300_TX_CHROMA_KEY_6                      0x4598 +#define R300_TX_CHROMA_KEY_7                      0x459c +#define R300_TX_CHROMA_KEY_8                      0x45a0 +#define R300_TX_CHROMA_KEY_9                      0x45a4 +#define R300_TX_CHROMA_KEY_10                     0x45a8 +#define R300_TX_CHROMA_KEY_11                     0x45ac +#define R300_TX_CHROMA_KEY_12                     0x45b0 +#define R300_TX_CHROMA_KEY_13                     0x45b4 +#define R300_TX_CHROMA_KEY_14                     0x45b8 +#define R300_TX_CHROMA_KEY_15                     0x45bc +/* ff00ff00 == { 0, 1.0, 0, 1.0 } */ + +/* Border Color */ +#define R300_TX_BORDER_COLOR_0              0x45c0 +#define R300_TX_BORDER_COLOR_1              0x45c4 +#define R300_TX_BORDER_COLOR_2              0x45c8 +#define R300_TX_BORDER_COLOR_3              0x45cc +#define R300_TX_BORDER_COLOR_4              0x45d0 +#define R300_TX_BORDER_COLOR_5              0x45d4 +#define R300_TX_BORDER_COLOR_6              0x45d8 +#define R300_TX_BORDER_COLOR_7              0x45dc +#define R300_TX_BORDER_COLOR_8              0x45e0 +#define R300_TX_BORDER_COLOR_9              0x45e4 +#define R300_TX_BORDER_COLOR_10             0x45e8 +#define R300_TX_BORDER_COLOR_11             0x45ec +#define R300_TX_BORDER_COLOR_12             0x45f0 +#define R300_TX_BORDER_COLOR_13             0x45f4 
+#define R300_TX_BORDER_COLOR_14             0x45f8
+#define R300_TX_BORDER_COLOR_15             0x45fc
+
+
+/* END: Texture specification */
+
+/* BEGIN: Fragment program instruction set */
+
+/* Fragment programs are written directly into register space.
+ * There are separate instruction streams for texture instructions and ALU
+ * instructions.
+ * In order to synchronize these streams, the program is divided into up
+ * to 4 nodes. Each node begins with a number of TEX operations, followed
+ * by a number of ALU operations.
+ * The first node can have zero TEX ops; all subsequent nodes must have at
+ * least one TEX op.
+ * All nodes must have at least one ALU op.
+ *
+ * The index of the last node is stored in PFS_CNTL_0: A value of 0 means
+ * 1 node, a value of 3 means 4 nodes.
+ * The total number of instructions is defined in PFS_CNTL_2. The offsets are
+ * offsets into the respective instruction streams, while *_END points to the
+ * last instruction relative to this offset.
+ */
+#define R300_US_CONFIG                      0x4600
+#       define R300_PFS_CNTL_LAST_NODES_SHIFT    0
+#       define R300_PFS_CNTL_LAST_NODES_MASK     (3 << 0)
+#       define R300_PFS_CNTL_FIRST_NODE_HAS_TEX  (1 << 3)
+#define R300_US_PIXSIZE                     0x4604
+/* There is an unshifted value here which has so far always been equal to the
+ * index of the highest used temporary register.
+ */
+#define R300_US_CODE_OFFSET                 0x4608
+#       define R300_PFS_CNTL_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_CNTL_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_CNTL_ALU_END_SHIFT       6
+#       define R300_PFS_CNTL_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_CNTL_TEX_OFFSET_SHIFT    13
+#       define R300_PFS_CNTL_TEX_OFFSET_MASK     (31 << 13)
+#       define R300_PFS_CNTL_TEX_END_SHIFT       18
+#       define R300_PFS_CNTL_TEX_END_MASK        (31 << 18)
+
+/* gap */
+
+/* Nodes are stored backwards. The last active node is always stored in
+ * PFS_NODE_3.
+ * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The
+ * first node is stored in NODE_2, the second node is stored in NODE_3.
+ *
+ * Offsets are relative to the master offset from PFS_CNTL_2.
+ */
+#define R300_US_CODE_ADDR_0                 0x4610
+#define R300_US_CODE_ADDR_1                 0x4614
+#define R300_US_CODE_ADDR_2                 0x4618
+#define R300_US_CODE_ADDR_3                 0x461C
+#       define R300_ALU_START_SHIFT         0
+#       define R300_ALU_START_MASK          (63 << 0)
+#       define R300_ALU_SIZE_SHIFT          6
+#       define R300_ALU_SIZE_MASK           (63 << 6)
+#       define R300_TEX_START_SHIFT         12
+#       define R300_TEX_START_MASK          (31 << 12)
+#       define R300_TEX_SIZE_SHIFT          17
+#       define R300_TEX_SIZE_MASK           (31 << 17)
+#	define R300_RGBA_OUT                (1 << 22)
+#	define R300_W_OUT                   (1 << 23)
+
+/* TEX
+ * As far as I can tell, texture instructions cannot write into output
+ * registers directly. A subsequent ALU instruction is always necessary,
+ * even if it's just MAD o0, r0, 1, 0
+ */
+#define R300_US_TEX_INST_0                  0x4620
+#	define R300_SRC_ADDR_SHIFT          0
+#	define R300_SRC_ADDR_MASK           (31 << 0)
+#	define R300_DST_ADDR_SHIFT          6
+#	define R300_DST_ADDR_MASK           (31 << 6)
+#	define R300_TEX_ID_SHIFT            11
+#       define R300_TEX_ID_MASK             (15 << 11)
+#	define R300_TEX_INST_SHIFT		15
+#		define R300_TEX_OP_NOP	        0
+#		define R300_TEX_OP_LD	        1
+#		define R300_TEX_OP_KIL	        2
+#		define R300_TEX_OP_TXP	        3
+#		define R300_TEX_OP_TXB	        4
+#	define R300_TEX_INST_MASK               (7 << 15)
+
+/* Output format from the unified shader */
+#define R300_US_OUT_FMT_0                   0x46A4
+#	define R300_US_OUT_FMT_C4_8         (0 << 0)
+#	define R300_US_OUT_FMT_C4_10        (1 << 0)
+#	define R300_US_OUT_FMT_C4_10_GAMMA  (2 << 0)
+#	define R300_US_OUT_FMT_C_16         (3 << 0)
+#	define R300_US_OUT_FMT_C2_16        (4 << 0)
+#	define R300_US_OUT_FMT_C4_16        (5 << 0)
+#	define R300_US_OUT_FMT_C_16_MPEG    (6 << 0)
+#	define R300_US_OUT_FMT_C2_16_MPEG   (7 << 0)
+#	define R300_US_OUT_FMT_C2_4         (8 << 0)
+#	define R300_US_OUT_FMT_C_3_3_2      (9 << 0)
+#	define R300_US_OUT_FMT_C_6_5_6      (10 << 0)
+#	define R300_US_OUT_FMT_C_11_11_10   (11 << 0)
+#	define R300_US_OUT_FMT_C_10_11_11   (12 << 0)
+#	define R300_US_OUT_FMT_C_2_10_10_10 (13 << 0)
+/* reserved */
+#	define R300_US_OUT_FMT_UNUSED       (15 << 0)
+#	define R300_US_OUT_FMT_C_16_FP      (16 << 0)
+#	define R300_US_OUT_FMT_C2_16_FP     (17 << 0)
+#	define R300_US_OUT_FMT_C4_16_FP     (18 << 0)
+#	define R300_US_OUT_FMT_C_32_FP      (19 << 0)
+#	define R300_US_OUT_FMT_C2_32_FP     (20 << 0)
+#	define R300_US_OUT_FMT_C4_32_FP     (21 << 0)
+#   define R300_C0_SEL_A				(0 << 8)
+#   define R300_C0_SEL_R				(1 << 8)
+#   define R300_C0_SEL_G				(2 << 8)
+#   define R300_C0_SEL_B				(3 << 8)
+#   define R300_C1_SEL_A				(0 << 10)
+#   define R300_C1_SEL_R				(1 << 10)
+#   define R300_C1_SEL_G				(2 << 10)
+#   define R300_C1_SEL_B				(3 << 10)
+#   define R300_C2_SEL_A				(0 << 12)
+#   define R300_C2_SEL_R				(1 << 12)
+#   define R300_C2_SEL_G				(2 << 12)
+#   define R300_C2_SEL_B				(3 << 12)
+#   define R300_C3_SEL_A				(0 << 14)
+#   define R300_C3_SEL_R				(1 << 14)
+#   define R300_C3_SEL_G				(2 << 14)
+#   define R300_C3_SEL_B				(3 << 14)
+#   define R300_OUT_SIGN(x)				(x << 16)
+
+/* ALU
+ * The ALU instructions register blocks are enumerated according to the order
+ * in which fglrx writes them. I assume there is space for 64 instructions,
+ * since each block has space for a maximum of 64 DWORDs, and this matches
+ * reported native limits.
+ *
+ * The basic functional block seems to be one MAD for each color and alpha,
+ * and an adder that adds all components after the MUL.
+ *  - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands
+ *  - DP4: Use OUTC_DP4, OUTA_DP4
+ *  - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands
+ *  - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands
+ *  - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1
+ *  - CMP: If ARG2 < 0, return ARG1, else return ARG0
+ *  - FLR: use FRC+MAD
+ *  - XPD: use MAD+MAD
+ *  - SGE, SLT: use MAD+CMP
+ *  - RSQ: use ABS modifier for argument
+ *  - Use OUTC_REPL_ALPHA to write results of an alpha-only operation
+ *    (e.g. RCP) into color register
+ *  - apparently, there's no quick DST operation
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2"
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0"
+ *  - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1"
+ *
+ * Operand selection
+ * First stage selects three sources from the available registers and
+ * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha).
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happen in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their source
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ *  - Argument order is the same as in ARB_fragment_program.
+ *  - Operation is MAD + *  - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP + *  - Set FPI0/FPI2_SPECIAL_LRP + * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD + */ +#define R300_US_ALU_RGB_ADDR_0                   0x46C0 +#       define R300_ALU_SRC0C_SHIFT             0 +#       define R300_ALU_SRC0C_MASK              (31 << 0) +#       define R300_ALU_SRC0C_CONST             (1 << 5) +#       define R300_ALU_SRC1C_SHIFT             6 +#       define R300_ALU_SRC1C_MASK              (31 << 6) +#       define R300_ALU_SRC1C_CONST             (1 << 11) +#       define R300_ALU_SRC2C_SHIFT             12 +#       define R300_ALU_SRC2C_MASK              (31 << 12) +#       define R300_ALU_SRC2C_CONST             (1 << 17) +#       define R300_ALU_SRC_MASK                0x0003ffff +#       define R300_ALU_DSTC_SHIFT              18 +#       define R300_ALU_DSTC_MASK               (31 << 18) +#		define R300_ALU_DSTC_REG_MASK_SHIFT     23 +#       define R300_ALU_DSTC_REG_X              (1 << 23) +#       define R300_ALU_DSTC_REG_Y              (1 << 24) +#       define R300_ALU_DSTC_REG_Z              (1 << 25) +#		define R300_ALU_DSTC_OUTPUT_MASK_SHIFT  26 +#       define R300_ALU_DSTC_OUTPUT_X           (1 << 26) +#       define R300_ALU_DSTC_OUTPUT_Y           (1 << 27) +#       define R300_ALU_DSTC_OUTPUT_Z           (1 << 28) + +#define R300_US_ALU_ALPHA_ADDR_0                 0x47C0 +#       define R300_ALU_SRC0A_SHIFT             0 +#       define R300_ALU_SRC0A_MASK              (31 << 0) +#       define R300_ALU_SRC0A_CONST             (1 << 5) +#       define R300_ALU_SRC1A_SHIFT             6 +#       define R300_ALU_SRC1A_MASK              (31 << 6) +#       define R300_ALU_SRC1A_CONST             (1 << 11) +#       define R300_ALU_SRC2A_SHIFT             12 +#       define R300_ALU_SRC2A_MASK              (31 << 12) +#       define R300_ALU_SRC2A_CONST             (1 << 17) +#       define R300_ALU_SRC_MASK                0x0003ffff +#       define R300_ALU_DSTA_SHIFT              18 +#       define R300_ALU_DSTA_MASK               (31 << 18) +#       define R300_ALU_DSTA_REG                (1 << 23) +#       define R300_ALU_DSTA_OUTPUT             (1 << 24) +#		define R300_ALU_DSTA_DEPTH              (1 << 27) + +#define R300_US_ALU_RGB_INST_0                   0x48C0 +#       define R300_ALU_ARGC_SRC0C_XYZ          0 +#       define R300_ALU_ARGC_SRC0C_XXX          1 +#       define R300_ALU_ARGC_SRC0C_YYY          2 +#       define R300_ALU_ARGC_SRC0C_ZZZ          3 +#       define R300_ALU_ARGC_SRC1C_XYZ          4 +#       define R300_ALU_ARGC_SRC1C_XXX          5 +#       define R300_ALU_ARGC_SRC1C_YYY          6 +#       define R300_ALU_ARGC_SRC1C_ZZZ          7 +#       define R300_ALU_ARGC_SRC2C_XYZ          8 +#       define R300_ALU_ARGC_SRC2C_XXX          9 +#       define R300_ALU_ARGC_SRC2C_YYY          10 +#       define R300_ALU_ARGC_SRC2C_ZZZ          11 +#       define R300_ALU_ARGC_SRC0A              12 +#       define R300_ALU_ARGC_SRC1A              13 +#       define R300_ALU_ARGC_SRC2A              14 +#       define R300_ALU_ARGC_SRCP_XYZ           15 +#       define R300_ALU_ARGC_SRCP_XXX           16 +#       define R300_ALU_ARGC_SRCP_YYY           17 +#       define R300_ALU_ARGC_SRCP_ZZZ           18 +#       define R300_ALU_ARGC_SRCP_WWW           19 +#       define R300_ALU_ARGC_ZERO               20 +#       define R300_ALU_ARGC_ONE                21 +#       define R300_ALU_ARGC_HALF               22 +#       define 
R300_ALU_ARGC_SRC0C_YZX          23 +#       define R300_ALU_ARGC_SRC1C_YZX          24 +#       define R300_ALU_ARGC_SRC2C_YZX          25 +#       define R300_ALU_ARGC_SRC0C_ZXY          26 +#       define R300_ALU_ARGC_SRC1C_ZXY          27 +#       define R300_ALU_ARGC_SRC2C_ZXY          28 +#       define R300_ALU_ARGC_SRC0CA_WZY         29 +#       define R300_ALU_ARGC_SRC1CA_WZY         30 +#       define R300_ALU_ARGC_SRC2CA_WZY         31 + +#       define R300_ALU_ARG0C_SHIFT             0 +#       define R300_ALU_ARG0C_MASK              (31 << 0) +#       define R300_ALU_ARG0C_NOP               (0 << 5) +#       define R300_ALU_ARG0C_NEG               (1 << 5) +#       define R300_ALU_ARG0C_ABS               (2 << 5) +#       define R300_ALU_ARG0C_NAB               (3 << 5) +#       define R300_ALU_ARG1C_SHIFT             7 +#       define R300_ALU_ARG1C_MASK              (31 << 7) +#       define R300_ALU_ARG1C_NOP               (0 << 12) +#       define R300_ALU_ARG1C_NEG               (1 << 12) +#       define R300_ALU_ARG1C_ABS               (2 << 12) +#       define R300_ALU_ARG1C_NAB               (3 << 12) +#       define R300_ALU_ARG2C_SHIFT             14 +#       define R300_ALU_ARG2C_MASK              (31 << 14) +#       define R300_ALU_ARG2C_NOP               (0 << 19) +#       define R300_ALU_ARG2C_NEG               (1 << 19) +#       define R300_ALU_ARG2C_ABS               (2 << 19) +#       define R300_ALU_ARG2C_NAB               (3 << 19) +#       define R300_ALU_SRCP_1_MINUS_2_SRC0     (0 << 21) +#       define R300_ALU_SRCP_SRC1_MINUS_SRC0    (1 << 21) +#       define R300_ALU_SRCP_SRC1_PLUS_SRC0     (2 << 21) +#       define R300_ALU_SRCP_1_MINUS_SRC0       (3 << 21) + +#       define R300_ALU_OUTC_MAD                (0 << 23) +#       define R300_ALU_OUTC_DP3                (1 << 23) +#       define R300_ALU_OUTC_DP4                (2 << 23) +#       define R300_ALU_OUTC_D2A                (3 << 23) +#       define R300_ALU_OUTC_MIN                (4 << 23) +#       define R300_ALU_OUTC_MAX                (5 << 23) +#       define R300_ALU_OUTC_CMPH               (7 << 23) +#       define R300_ALU_OUTC_CMP                (8 << 23) +#       define R300_ALU_OUTC_FRC                (9 << 23) +#       define R300_ALU_OUTC_REPL_ALPHA         (10 << 23) + +#       define R300_ALU_OUTC_MOD_NOP            (0 << 27) +#       define R300_ALU_OUTC_MOD_MUL2           (1 << 27) +#       define R300_ALU_OUTC_MOD_MUL4           (2 << 27) +#       define R300_ALU_OUTC_MOD_MUL8           (3 << 27) +#       define R300_ALU_OUTC_MOD_DIV2           (4 << 27) +#       define R300_ALU_OUTC_MOD_DIV4           (5 << 27) +#       define R300_ALU_OUTC_MOD_DIV8           (6 << 27) + +#       define R300_ALU_OUTC_CLAMP              (1 << 30) +#       define R300_ALU_INSERT_NOP              (1 << 31) + +#define R300_US_ALU_ALPHA_INST_0                 0x49C0 +#       define R300_ALU_ARGA_SRC0C_X            0 +#       define R300_ALU_ARGA_SRC0C_Y            1 +#       define R300_ALU_ARGA_SRC0C_Z            2 +#       define R300_ALU_ARGA_SRC1C_X            3 +#       define R300_ALU_ARGA_SRC1C_Y            4 +#       define R300_ALU_ARGA_SRC1C_Z            5 +#       define R300_ALU_ARGA_SRC2C_X            6 +#       define R300_ALU_ARGA_SRC2C_Y            7 +#       define R300_ALU_ARGA_SRC2C_Z            8 +#       define R300_ALU_ARGA_SRC0A              9 +#       define R300_ALU_ARGA_SRC1A              10 +#       define R300_ALU_ARGA_SRC2A              11 +#       define 
R300_ALU_ARGA_SRCP_X             12 +#       define R300_ALU_ARGA_SRCP_Y             13 +#       define R300_ALU_ARGA_SRCP_Z             14 +#       define R300_ALU_ARGA_SRCP_W             15 + +#       define R300_ALU_ARGA_ZERO               16 +#       define R300_ALU_ARGA_ONE                17 +#       define R300_ALU_ARGA_HALF               18 +#       define R300_ALU_ARG0A_SHIFT             0 +#       define R300_ALU_ARG0A_MASK              (31 << 0) +#       define R300_ALU_ARG0A_NOP               (0 << 5) +#       define R300_ALU_ARG0A_NEG               (1 << 5) +#	define R300_ALU_ARG0A_ABS		 (2 << 5) +#	define R300_ALU_ARG0A_NAB		 (3 << 5) +#       define R300_ALU_ARG1A_SHIFT             7 +#       define R300_ALU_ARG1A_MASK              (31 << 7) +#       define R300_ALU_ARG1A_NOP               (0 << 12) +#       define R300_ALU_ARG1A_NEG               (1 << 12) +#	define R300_ALU_ARG1A_ABS		 (2 << 12) +#	define R300_ALU_ARG1A_NAB		 (3 << 12) +#       define R300_ALU_ARG2A_SHIFT             14 +#       define R300_ALU_ARG2A_MASK              (31 << 14) +#       define R300_ALU_ARG2A_NOP               (0 << 19) +#       define R300_ALU_ARG2A_NEG               (1 << 19) +#	define R300_ALU_ARG2A_ABS		 (2 << 19) +#	define R300_ALU_ARG2A_NAB		 (3 << 19) +#       define R300_ALU_SRCP_1_MINUS_2_SRC0     (0 << 21) +#       define R300_ALU_SRCP_SRC1_MINUS_SRC0    (1 << 21) +#       define R300_ALU_SRCP_SRC1_PLUS_SRC0     (2 << 21) +#       define R300_ALU_SRCP_1_MINUS_SRC0       (3 << 21) + +#       define R300_ALU_OUTA_MAD                (0 << 23) +#       define R300_ALU_OUTA_DP4                (1 << 23) +#       define R300_ALU_OUTA_MIN                (2 << 23) +#       define R300_ALU_OUTA_MAX                (3 << 23) +#       define R300_ALU_OUTA_CND                (5 << 23) +#       define R300_ALU_OUTA_CMP                (6 << 23) +#       define R300_ALU_OUTA_FRC                (7 << 23) +#       define R300_ALU_OUTA_EX2                (8 << 23) +#       define R300_ALU_OUTA_LG2                (9 << 23) +#       define R300_ALU_OUTA_RCP                (10 << 23) +#       define R300_ALU_OUTA_RSQ                (11 << 23) + +#       define R300_ALU_OUTA_MOD_NOP            (0 << 27) +#       define R300_ALU_OUTA_MOD_MUL2           (1 << 27) +#       define R300_ALU_OUTA_MOD_MUL4           (2 << 27) +#       define R300_ALU_OUTA_MOD_MUL8           (3 << 27) +#       define R300_ALU_OUTA_MOD_DIV2           (4 << 27) +#       define R300_ALU_OUTA_MOD_DIV4           (5 << 27) +#       define R300_ALU_OUTA_MOD_DIV8           (6 << 27) + +#       define R300_ALU_OUTA_CLAMP              (1 << 30) +/* END: Fragment program instruction set */ + +/* Fog: Fog Blending Enable */ +#define R300_FG_FOG_BLEND                             0x4bc0 +#       define R300_FG_FOG_BLEND_DISABLE              (0 << 0) +#       define R300_FG_FOG_BLEND_ENABLE               (1 << 0) +#	define R300_FG_FOG_BLEND_FN_LINEAR            (0 << 1) +#	define R300_FG_FOG_BLEND_FN_EXP               (1 << 1) +#	define R300_FG_FOG_BLEND_FN_EXP2              (2 << 1) +#	define R300_FG_FOG_BLEND_FN_CONSTANT          (3 << 1) +#	define R300_FG_FOG_BLEND_FN_MASK              (3 << 1) + +/* Fog: Red Component of Fog Color */ +#define R300_FG_FOG_COLOR_R                           0x4bc8 +/* Fog: Green Component of Fog Color */ +#define R300_FG_FOG_COLOR_G                           0x4bcc +/* Fog: Blue Component of Fog Color */ +#define R300_FG_FOG_COLOR_B                           0x4bd0 +#	define R300_FG_FOG_COLOR_MASK 0x000003ff 
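+
+/* Editor's note (illustration only, not from the original documentation):
+ * classic linear fog would presumably enable
+ *   FG_FOG_BLEND = R300_FG_FOG_BLEND_ENABLE | R300_FG_FOG_BLEND_FN_LINEAR;
+ * with the fog color split over the three registers above, one 10-bit
+ * component each (0x3ff appearing to correspond to full intensity).
+ */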
+ +/* Fog: Constant Factor for Fog Blending */ +#define R300_FG_FOG_FACTOR                            0x4bc4 +#	define FG_FOG_FACTOR_MASK 0x000003ff + +/* Fog: Alpha function */ +#define R300_FG_ALPHA_FUNC                            0x4bd4 +#       define R300_FG_ALPHA_FUNC_VAL_MASK               0x000000ff +#       define R300_FG_ALPHA_FUNC_NEVER                     (0 << 8) +#       define R300_FG_ALPHA_FUNC_LESS                      (1 << 8) +#       define R300_FG_ALPHA_FUNC_EQUAL                     (2 << 8) +#       define R300_FG_ALPHA_FUNC_LE                        (3 << 8) +#       define R300_FG_ALPHA_FUNC_GREATER                   (4 << 8) +#       define R300_FG_ALPHA_FUNC_NOTEQUAL                  (5 << 8) +#       define R300_FG_ALPHA_FUNC_GE                        (6 << 8) +#       define R300_FG_ALPHA_FUNC_ALWAYS                    (7 << 8) +#       define R300_ALPHA_TEST_OP_MASK                      (7 << 8) +#       define R300_FG_ALPHA_FUNC_DISABLE                   (0 << 11) +#       define R300_FG_ALPHA_FUNC_ENABLE                    (1 << 11) + +#       define R500_FG_ALPHA_FUNC_10BIT                     (0 << 12) +#       define R500_FG_ALPHA_FUNC_8BIT                      (1 << 12) + +#       define R300_FG_ALPHA_FUNC_MASK_DISABLE              (0 << 16) +#       define R300_FG_ALPHA_FUNC_MASK_ENABLE               (1 << 16) +#       define R300_FG_ALPHA_FUNC_CFG_2_OF_4                (0 << 17) +#       define R300_FG_ALPHA_FUNC_CFG_3_OF_6                (1 << 17) + +#       define R300_FG_ALPHA_FUNC_DITH_DISABLE              (0 << 20) +#       define R300_FG_ALPHA_FUNC_DITH_ENABLE               (1 << 20) + +#       define R500_FG_ALPHA_FUNC_OFFSET_DISABLE            (0 << 24) +#       define R500_FG_ALPHA_FUNC_OFFSET_ENABLE             (1 << 24) /* Not supported in R520 */ +#       define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_DISABLE    (0 << 25) +#       define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_ENABLE     (1 << 25) + +#       define R500_FG_ALPHA_FUNC_FP16_DISABLE              (0 << 28) +#       define R500_FG_ALPHA_FUNC_FP16_ENABLE               (1 << 28) + + +/* Fog: Where does the depth come from? */ +#define R300_FG_DEPTH_SRC                  0x4bd8 +#	define R300_FG_DEPTH_SRC_SCAN   (0 << 0) +#	define R300_FG_DEPTH_SRC_SHADER (1 << 0) + +/* Fog: Alpha Compare Value */ +#define R500_FG_ALPHA_VALUE                0x4be0 +#	define R500_FG_ALPHA_VALUE_MASK 0x0000ffff + +/* gap */ + +/* Fragment program parameters in 7.16 floating point */ +#define R300_PFS_PARAM_0_X                  0x4C00 +#define R300_PFS_PARAM_0_Y                  0x4C04 +#define R300_PFS_PARAM_0_Z                  0x4C08 +#define R300_PFS_PARAM_0_W                  0x4C0C +/* last consts */ +#define R300_PFS_PARAM_31_X                 0x4DF0 +#define R300_PFS_PARAM_31_Y                 0x4DF4 +#define R300_PFS_PARAM_31_Z                 0x4DF8 +#define R300_PFS_PARAM_31_W                 0x4DFC + +/* Unpipelined. 
*/ +#define R300_RB3D_CCTL                      0x4e00 +#	define R300_RB3D_CCTL_NUM_MULTIWRITES_1_BUFFER                (0 << 5) +#	define R300_RB3D_CCTL_NUM_MULTIWRITES_2_BUFFERS               (1 << 5) +#	define R300_RB3D_CCTL_NUM_MULTIWRITES_3_BUFFERS               (2 << 5) +#	define R300_RB3D_CCTL_NUM_MULTIWRITES_4_BUFFERS               (3 << 5) +#	define R300_RB3D_CCTL_CLRCMP_FLIPE_DISABLE                    (0 << 7) +#	define R300_RB3D_CCTL_CLRCMP_FLIPE_ENABLE                     (1 << 7) +#	define R300_RB3D_CCTL_AA_COMPRESSION_DISABLE                  (0 << 9) +#	define R300_RB3D_CCTL_AA_COMPRESSION_ENABLE                   (1 << 9) +#	define R300_RB3D_CCTL_CMASK_DISABLE                           (0 << 10) +#	define R300_RB3D_CCTL_CMASK_ENABLE                            (1 << 10) +/* reserved */ +#	define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_DISABLE  (0 << 12) +#	define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_ENABLE   (1 << 12) +#	define R300_RB3D_CCTL_WRITE_COMPRESSION_ENABLE                (0 << 13) +#	define R300_RB3D_CCTL_WRITE_COMPRESSION_DISABLE               (1 << 13) +#	define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_DISABLE  (0 << 14) +#	define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE   (1 << 14) + + +/* Notes: + * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in + *   the application + * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND + *    are set to the same + *   function (both registers are always set up completely in any case) + * - Most blend flags are simply copied from R200 and not tested yet + */ +#define R300_RB3D_CBLEND                    0x4E04 +#define R300_RB3D_ABLEND                    0x4E08 +/* the following only appear in CBLEND */ +#       define R300_ALPHA_BLEND_ENABLE         (1 << 0) +#       define R300_SEPARATE_ALPHA_ENABLE      (1 << 1) +#       define R300_READ_ENABLE                (1 << 2) +#       define R300_DISCARD_SRC_PIXELS_DIS     (0 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0     (1 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_0     (2 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0     (3 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1     (4 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1     (5 << 3) +#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1     (6 << 3) + +/* the following are shared between CBLEND and ABLEND */ +#       define R300_FCN_MASK                         (3  << 12) +#       define R300_COMB_FCN_ADD_CLAMP               (0  << 12) +#       define R300_COMB_FCN_ADD_NOCLAMP             (1  << 12) +#       define R300_COMB_FCN_SUB_CLAMP               (2  << 12) +#       define R300_COMB_FCN_SUB_NOCLAMP             (3  << 12) +#       define R300_COMB_FCN_MIN                     (4  << 12) +#       define R300_COMB_FCN_MAX                     (5  << 12) +#       define R300_COMB_FCN_RSUB_CLAMP              (6  << 12) +#       define R300_COMB_FCN_RSUB_NOCLAMP            (7  << 12) +#       define R300_BLEND_GL_ZERO                    (32) +#       define R300_BLEND_GL_ONE                     (33) +#       define R300_BLEND_GL_SRC_COLOR               (34) +#       define R300_BLEND_GL_ONE_MINUS_SRC_COLOR     (35) +#       define R300_BLEND_GL_DST_COLOR               (36) +#       define R300_BLEND_GL_ONE_MINUS_DST_COLOR     (37) +#       define R300_BLEND_GL_SRC_ALPHA               (38) +#       define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA     (39) +#       define R300_BLEND_GL_DST_ALPHA       
        (40) +#       define R300_BLEND_GL_ONE_MINUS_DST_ALPHA     (41) +#       define R300_BLEND_GL_SRC_ALPHA_SATURATE      (42) +#       define R300_BLEND_GL_CONST_COLOR             (43) +#       define R300_BLEND_GL_ONE_MINUS_CONST_COLOR   (44) +#       define R300_BLEND_GL_CONST_ALPHA             (45) +#       define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA   (46) +#       define R300_BLEND_MASK                       (63) +#       define R300_SRC_BLEND_SHIFT                  (16) +#       define R300_DST_BLEND_SHIFT                  (24) + +/* Constant color used by the blender. Pipelined through the blender. + * Note: For R520, this field is ignored, use RB3D_CONSTANT_COLOR_GB__BLUE, + * RB3D_CONSTANT_COLOR_GB__GREEN, etc. instead. + */ +#define R300_RB3D_BLEND_COLOR               0x4E10 + + +/* 3D Color Channel Mask. If all the channels used in the current color format + * are disabled, then the cb will discard all the incoming quads. Pipelined + * through the blender. + */ +#define RB3D_COLOR_CHANNEL_MASK                  0x4E0C +#	define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0  (1 << 0) +#	define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 (1 << 1) +#	define RB3D_COLOR_CHANNEL_MASK_RED_MASK0   (1 << 2) +#	define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 (1 << 3) +#	define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK1  (1 << 4) +#	define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK1 (1 << 5) +#	define RB3D_COLOR_CHANNEL_MASK_RED_MASK1   (1 << 6) +#	define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK1 (1 << 7) +#	define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK2  (1 << 8) +#	define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK2 (1 << 9) +#	define RB3D_COLOR_CHANNEL_MASK_RED_MASK2   (1 << 10) +#	define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK2 (1 << 11) +#	define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK3  (1 << 12) +#	define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK3 (1 << 13) +#	define RB3D_COLOR_CHANNEL_MASK_RED_MASK3   (1 << 14) +#	define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK3 (1 << 15) + +/* Clear color that is used when the color mask is set to 00. Unpipelined. + * Program this register with a 32-bit value in ARGB8888 or ARGB2101010 + * formats, ignoring the fields. + */ +#define RB3D_COLOR_CLEAR_VALUE                   0x4e14 + +/* gap */ + +/* Color Compare Color. Stalls the 2d/3d datapath until it is idle. */ +#define RB3D_CLRCMP_CLR                     0x4e20 + +/* Color Compare Mask. Stalls the 2d/3d datapath until it is idle. */ +#define RB3D_CLRCMP_MSK                     0x4e24 + +/* Color Buffer Address Offset of multibuffer 0. Unpipelined. */ +#define R300_RB3D_COLOROFFSET0              0x4E28 +#       define R300_COLOROFFSET_MASK             0xFFFFFFE0 +/* Color Buffer Address Offset of multibuffer 1. Unpipelined. */ +#define R300_RB3D_COLOROFFSET1              0x4E2C +/* Color Buffer Address Offset of multibuffer 2. Unpipelined. */ +#define R300_RB3D_COLOROFFSET2              0x4E30 +/* Color Buffer Address Offset of multibuffer 3. Unpipelined. */ +#define R300_RB3D_COLOROFFSET3              0x4E34 + +/* Color buffer format and tiling control for all the multibuffers and the + * pitch of multibuffer 0 to 3. Unpipelined. The cache must be empty before any + * of the registers are changed. + * + * Bit 16: Larger tiles + * Bit 17: 4x2 tiles + * Bit 18: Extremely weird tile like, but some pixels duplicated? 
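+ *
+ * Editor's illustration (an assumption based on the defines below, not on
+ * documentation): a linear, unswapped ARGB8888 color buffer that is
+ * 1024 pixels wide would plausibly be programmed as
+ *   RB3D_COLORPITCH0 = 1024 | R300_COLOR_FORMAT_ARGB8888 |
+ *                      R300_COLOR_ENDIAN_NO_SWAP;
+ * with all tiling bits left at zero and the pitch given in pixels.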
+ */ +#define R300_RB3D_COLORPITCH0               0x4E38 +#       define R300_COLORPITCH_MASK              0x00003FFE +#       define R300_COLOR_TILE_DISABLE            (0 << 16) +#       define R300_COLOR_TILE_ENABLE             (1 << 16) +#       define R300_COLOR_MICROTILE_DISABLE       (0 << 17) +#       define R300_COLOR_MICROTILE_ENABLE        (1 << 17) +#       define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */ +#       define R300_COLOR_ENDIAN_NO_SWAP          (0 << 19) +#       define R300_COLOR_ENDIAN_WORD_SWAP        (1 << 19) +#       define R300_COLOR_ENDIAN_DWORD_SWAP       (2 << 19) +#       define R300_COLOR_ENDIAN_HALF_DWORD_SWAP  (3 << 19) +#	define R500_COLOR_FORMAT_ARGB10101010     (0 << 21) +#	define R500_COLOR_FORMAT_UV1010           (1 << 21) +#	define R500_COLOR_FORMAT_CI8              (2 << 21) /* 2D only */ +#	define R300_COLOR_FORMAT_ARGB1555         (3 << 21) +#       define R300_COLOR_FORMAT_RGB565           (4 << 21) +#       define R500_COLOR_FORMAT_ARGB2101010      (5 << 21) +#       define R300_COLOR_FORMAT_ARGB8888         (6 << 21) +#       define R300_COLOR_FORMAT_ARGB32323232     (7 << 21) +/* reserved */ +#       define R300_COLOR_FORMAT_I8               (9 << 21) +#       define R300_COLOR_FORMAT_ARGB16161616     (10 << 21) +#       define R300_COLOR_FORMAT_VYUY             (11 << 21) +#       define R300_COLOR_FORMAT_YVYU             (12 << 21) +#       define R300_COLOR_FORMAT_UV88             (13 << 21) +#       define R500_COLOR_FORMAT_I10              (14 << 21) +#       define R300_COLOR_FORMAT_ARGB4444         (15 << 21) +#define R300_RB3D_COLORPITCH1               0x4E3C +#define R300_RB3D_COLORPITCH2               0x4E40 +#define R300_RB3D_COLORPITCH3               0x4E44 + +/* gap */ + +/* Destination Color Buffer Cache Control/Status. If the cb is in e2 mode, then + * a flush or free will not occur upon a write to this register, but a sync + * will be immediately sent if one is requested. If both DC_FLUSH and DC_FREE + * are zero but DC_FINISH is one, then a sync will be sent immediately -- the + * cb will not wait for all the previous operations to complete before sending + * the sync. Unpipelined except when DC_FINISH and DC_FREE are both set to + * zero. + * + * Set to 0A before 3D operations, set to 02 afterwards. 
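+ *
+ * Decoded against the defines below (editor's note): 0x0A is
+ * R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D (2 << 0) together with
+ * R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS (2 << 2), while 0x02 is
+ * the dirty-3D flush alone with DC_FREE left at "no effect".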
+ */ +#define R300_RB3D_DSTCACHE_CTLSTAT               0x4e4c +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT         (0 << 0) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT_1       (1 << 0) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D    (2 << 0) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D_1  (3 << 0) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT          (0 << 2) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT_1        (1 << 2) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS       (2 << 2) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS_1     (3 << 2) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_NO_SIGNAL        (0 << 4) +#	define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL           (1 << 4) + +#define R300_RB3D_DITHER_CTL 0x4E50 +#	define R300_RB3D_DITHER_CTL_DITHER_MODE_TRUNCATE         (0 << 0) +#	define R300_RB3D_DITHER_CTL_DITHER_MODE_ROUND            (1 << 0) +#	define R300_RB3D_DITHER_CTL_DITHER_MODE_LUT              (2 << 0) +/* reserved */ +#	define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_TRUNCATE   (0 << 2) +#	define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_ROUND      (1 << 2) +#	define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT        (2 << 2) +/* reserved */ + +/* Resolve buffer destination address. The cache must be empty before changing + * this register if the cb is in resolve mode. Unpipelined + */ +#define R300_RB3D_AARESOLVE_OFFSET        0x4e80 +#	define R300_RB3D_AARESOLVE_OFFSET_SHIFT 5 +#	define R300_RB3D_AARESOLVE_OFFSET_MASK 0xffffffe0 /* At least according to the calculations of Christoph Brill */ + +/* Resolve Buffer Pitch and Tiling Control. The cache must be empty before + * changing this register if the cb is in resolve mode. Unpipelined + */ +#define R300_RB3D_AARESOLVE_PITCH         0x4e84 +#	define R300_RB3D_AARESOLVE_PITCH_SHIFT 1 +#	define R300_RB3D_AARESOLVE_PITCH_MASK  0x00003ffe /* At least according to the calculations of Christoph Brill */ + +/* Resolve Buffer Control. Unpipelined */ +#define R300_RB3D_AARESOLVE_CTL           0x4e88 +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_NORMAL   (0 << 0) +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE  (1 << 0) +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_10      (0 << 1) +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22      (1 << 1) +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0 (0 << 2) +#	define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE (1 << 2) + + +/* Discard src pixels less than or equal to threshold. */ +#define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD 0x4ea0 +/* Discard src pixels greater than or equal to threshold. */ +#define R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD 0x4ea4 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_SHIFT 0 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_MASK 0x000000ff +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_SHIFT 8 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_MASK 0x0000ff00 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_SHIFT 16 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_MASK 0x00ff0000 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_SHIFT 24 +#	define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_MASK 0xff000000 + +/* 3D ROP Control. Stalls the 2d/3d datapath until it is idle. 
*/ +#define R300_RB3D_ROPCNTL                             0x4e18 +#	define R300_RB3D_ROPCNTL_ROP_ENABLE            0x00000004 +#	define R300_RB3D_ROPCNTL_ROP_MASK              (15 << 8) +#	define R300_RB3D_ROPCNTL_ROP_SHIFT             8 + +/* Color Compare Flip. Stalls the 2d/3d datapath until it is idle. */ +#define R300_RB3D_CLRCMP_FLIPE                        0x4e1c + +/* Sets the fifo sizes */ +#define R500_RB3D_FIFO_SIZE                           0x4ef4 +#	define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_FULL   (0 << 0) +#	define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_HALF   (1 << 0) +#	define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_QUATER (2 << 0) +#	define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_EIGTHS (3 << 0) + +/* Constant color used by the blender. Pipelined through the blender. */ +#define R500_RB3D_CONSTANT_COLOR_AR                   0x4ef8 +#	define R500_RB3D_CONSTANT_COLOR_AR_RED_MASK    0x0000ffff +#	define R500_RB3D_CONSTANT_COLOR_AR_RED_SHIFT   0 +#	define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_MASK  0xffff0000 +#	define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_SHIFT 16 + +/* Constant color used by the blender. Pipelined through the blender. */ +#define R500_RB3D_CONSTANT_COLOR_GB                   0x4efc +#	define R500_RB3D_CONSTANT_COLOR_AR_BLUE_MASK   0x0000ffff +#	define R500_RB3D_CONSTANT_COLOR_AR_BLUE_SHIFT  0 +#	define R500_RB3D_CONSTANT_COLOR_AR_GREEN_MASK  0xffff0000 +#	define R500_RB3D_CONSTANT_COLOR_AR_GREEN_SHIFT 16 + +/* gap */ +/* There seems to be no "write only" setting, so use Z-test = ALWAYS + * for this. + * Bit (1<<8) is the "test" bit. so plain write is 6  - vd + */ +#define R300_ZB_CNTL                             0x4F00 +#	define R300_STENCIL_ENABLE		 (1 << 0) +#	define R300_Z_ENABLE		         (1 << 1) +#	define R300_Z_WRITE_ENABLE		 (1 << 2) +#	define R300_Z_SIGNED_COMPARE		 (1 << 3) +#	define R300_STENCIL_FRONT_BACK		 (1 << 4) + +#define R300_ZB_ZSTENCILCNTL                   0x4f04 +	/* functions */ +#	define R300_ZS_NEVER			0 +#	define R300_ZS_LESS			1 +#	define R300_ZS_LEQUAL			2 +#	define R300_ZS_EQUAL			3 +#	define R300_ZS_GEQUAL			4 +#	define R300_ZS_GREATER			5 +#	define R300_ZS_NOTEQUAL			6 +#	define R300_ZS_ALWAYS			7 +#       define R300_ZS_MASK                     7 +	/* operations */ +#	define R300_ZS_KEEP			0 +#	define R300_ZS_ZERO			1 +#	define R300_ZS_REPLACE			2 +#	define R300_ZS_INCR			3 +#	define R300_ZS_DECR			4 +#	define R300_ZS_INVERT			5 +#	define R300_ZS_INCR_WRAP		6 +#	define R300_ZS_DECR_WRAP		7 +#	define R300_Z_FUNC_SHIFT		0 +	/* front and back refer to operations done for front +	   and back faces, i.e. 
separate stencil function support */ +#	define R300_S_FRONT_FUNC_SHIFT	        3 +#	define R300_S_FRONT_SFAIL_OP_SHIFT	6 +#	define R300_S_FRONT_ZPASS_OP_SHIFT	9 +#	define R300_S_FRONT_ZFAIL_OP_SHIFT      12 +#	define R300_S_BACK_FUNC_SHIFT           15 +#	define R300_S_BACK_SFAIL_OP_SHIFT       18 +#	define R300_S_BACK_ZPASS_OP_SHIFT       21 +#	define R300_S_BACK_ZFAIL_OP_SHIFT       24 + +#define R300_ZB_STENCILREFMASK                        0x4f08 +#	define R300_STENCILREF_SHIFT       0 +#	define R300_STENCILREF_MASK        0x000000ff +#	define R300_STENCILMASK_SHIFT      8 +#	define R300_STENCILMASK_MASK       0x0000ff00 +#	define R300_STENCILWRITEMASK_SHIFT 16 +#	define R300_STENCILWRITEMASK_MASK  0x00ff0000 + +/* gap */ + +#define R300_ZB_FORMAT                             0x4f10 +#	define R300_DEPTHFORMAT_16BIT_INT_Z   (0 << 0) +#	define R300_DEPTHFORMAT_16BIT_13E3    (1 << 0) +#	define R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL   (2 << 0) +/* reserved up to (15 << 0) */ +#	define R300_INVERT_13E3_LEADING_ONES  (0 << 4) +#	define R300_INVERT_13E3_LEADING_ZEROS (1 << 4) + +#define R300_ZB_ZTOP                             0x4F14 +#	define R300_ZTOP_DISABLE                 (0 << 0) +#	define R300_ZTOP_ENABLE                  (1 << 0) + +/* gap */ + +#define R300_ZB_ZCACHE_CTLSTAT            0x4f18 +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_NO_EFFECT      (0 << 0) +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE (1 << 0) +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_NO_EFFECT       (0 << 1) +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE            (1 << 1) +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_IDLE            (0 << 31) +#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_BUSY            (1 << 31) + +#define R300_ZB_BW_CNTL                     0x4f1c +#	define R300_HIZ_DISABLE                              (0 << 0) +#	define R300_HIZ_ENABLE                               (1 << 0) +#	define R300_HIZ_MIN                                  (0 << 1) +#	define R300_HIZ_MAX                                  (1 << 1) +#	define R300_FAST_FILL_DISABLE                        (0 << 2) +#	define R300_FAST_FILL_ENABLE                         (1 << 2) +#	define R300_RD_COMP_DISABLE                          (0 << 3) +#	define R300_RD_COMP_ENABLE                           (1 << 3) +#	define R300_WR_COMP_DISABLE                          (0 << 4) +#	define R300_WR_COMP_ENABLE                           (1 << 4) +#	define R300_ZB_CB_CLEAR_RMW                          (0 << 5) +#	define R300_ZB_CB_CLEAR_CACHE_LINEAR                 (1 << 5) +#	define R300_FORCE_COMPRESSED_STENCIL_VALUE_DISABLE   (0 << 6) +#	define R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE    (1 << 6) + +#	define R500_ZEQUAL_OPTIMIZE_ENABLE                   (0 << 7) +#	define R500_ZEQUAL_OPTIMIZE_DISABLE                  (1 << 7) +#	define R500_SEQUAL_OPTIMIZE_ENABLE                   (0 << 8) +#	define R500_SEQUAL_OPTIMIZE_DISABLE                  (1 << 8) + +#	define R500_BMASK_ENABLE                             (0 << 10) +#	define R500_BMASK_DISABLE                            (1 << 10) +#	define R500_HIZ_EQUAL_REJECT_DISABLE                 (0 << 11) +#	define R500_HIZ_EQUAL_REJECT_ENABLE                  (1 << 11) +#	define R500_HIZ_FP_EXP_BITS_DISABLE                  (0 << 12) +#	define R500_HIZ_FP_EXP_BITS_1                        (1 << 12) +#	define R500_HIZ_FP_EXP_BITS_2                        (2 << 12) +#	define R500_HIZ_FP_EXP_BITS_3                        (3 << 12) +#	define R500_HIZ_FP_EXP_BITS_4           
             (4 << 12) +#	define R500_HIZ_FP_EXP_BITS_5                        (5 << 12) +#	define R500_HIZ_FP_INVERT_LEADING_ONES               (0 << 15) +#	define R500_HIZ_FP_INVERT_LEADING_ZEROS              (1 << 15) +#	define R500_TILE_OVERWRITE_RECOMPRESSION_ENABLE      (0 << 16) +#	define R500_TILE_OVERWRITE_RECOMPRESSION_DISABLE     (1 << 16) +#	define R500_CONTIGUOUS_6XAA_SAMPLES_ENABLE           (0 << 17) +#	define R500_CONTIGUOUS_6XAA_SAMPLES_DISABLE          (1 << 17) +#	define R500_PEQ_PACKING_DISABLE                      (0 << 18) +#	define R500_PEQ_PACKING_ENABLE                       (1 << 18) +#	define R500_COVERED_PTR_MASKING_DISABLE              (0 << 18) +#	define R500_COVERED_PTR_MASKING_ENABLE               (1 << 18) + + +/* gap */ + +/* Z Buffer Address Offset. + * Bits 31 to 5 are used for aligned Z buffer address offset for macro tiles. + */ +#define R300_ZB_DEPTHOFFSET               0x4f20 + +/* Z Buffer Pitch and Endian Control */ +#define R300_ZB_DEPTHPITCH                0x4f24 +#       define R300_DEPTHPITCH_MASK              0x00003FFC +#       define R300_DEPTHMACROTILE_DISABLE      (0 << 16) +#       define R300_DEPTHMACROTILE_ENABLE       (1 << 16) +#       define R300_DEPTHMICROTILE_LINEAR       (0 << 17) +#       define R300_DEPTHMICROTILE_TILED        (1 << 17) +#       define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17) +#       define R300_DEPTHENDIAN_NO_SWAP         (0 << 18) +#       define R300_DEPTHENDIAN_WORD_SWAP       (1 << 18) +#       define R300_DEPTHENDIAN_DWORD_SWAP      (2 << 18) +#       define R300_DEPTHENDIAN_HALF_DWORD_SWAP (3 << 18) + +/* Z Buffer Clear Value */ +#define R300_ZB_DEPTHCLEARVALUE                  0x4f28 + +/* Hierarchical Z Memory Offset */ +#define R300_ZB_HIZ_OFFSET                       0x4f44 + +/* Hierarchical Z Write Index */ +#define R300_ZB_HIZ_WRINDEX                      0x4f48 + +/* Hierarchical Z Data */ +#define R300_ZB_HIZ_DWORD                        0x4f4c + +/* Hierarchical Z Read Index */ +#define R300_ZB_HIZ_RDINDEX                      0x4f50 + +/* Hierarchical Z Pitch */ +#define R300_ZB_HIZ_PITCH                        0x4f54 + +/* Z Buffer Z Pass Counter Data */ +#define R300_ZB_ZPASS_DATA                       0x4f58 + +/* Z Buffer Z Pass Counter Address */ +#define R300_ZB_ZPASS_ADDR                       0x4f5c + +/* Depth buffer X and Y coordinate offset */ +#define R300_ZB_DEPTHXY_OFFSET                   0x4f60 +#	define R300_DEPTHX_OFFSET_SHIFT  1 +#	define R300_DEPTHX_OFFSET_MASK   0x000007FE +#	define R300_DEPTHY_OFFSET_SHIFT  17 +#	define R300_DEPTHY_OFFSET_MASK   0x07FE0000 + +/* Sets the fifo sizes */ +#define R500_ZB_FIFO_SIZE                        0x4fd0 +#	define R500_OP_FIFO_SIZE_FULL   (0 << 0) +#	define R500_OP_FIFO_SIZE_HALF   (1 << 0) +#	define R500_OP_FIFO_SIZE_QUATER (2 << 0) +#	define R500_OP_FIFO_SIZE_EIGTHS (4 << 0) + +/* Stencil Reference Value and Mask for backfacing quads */ +/* R300_ZB_STENCILREFMASK handles front face */ +#define R500_ZB_STENCILREFMASK_BF                0x4fd4 +#	define R500_STENCILREF_SHIFT       0 +#	define R500_STENCILREF_MASK        0x000000ff +#	define R500_STENCILMASK_SHIFT      8 +#	define R500_STENCILMASK_MASK       0x0000ff00 +#	define R500_STENCILWRITEMASK_SHIFT 16 +#	define R500_STENCILWRITEMASK_MASK  0x00ff0000 + +/** + * \defgroup R3XX_R5XX_PROGRAMMABLE_VERTEX_SHADER_DESCRIPTION R3XX-R5XX PROGRAMMABLE VERTEX SHADER DESCRIPTION + * + * The PVS_DST_MATH_INST is used to identify whether the instruction is a Vector + * Engine instruction or a 
Math Engine instruction. + */ + +/*\{*/ + +enum { +	/* R3XX */ +	VECTOR_NO_OP			= 0, +	VE_DOT_PRODUCT			= 1, +	VE_MULTIPLY			= 2, +	VE_ADD				= 3, +	VE_MULTIPLY_ADD			= 4, +	VE_DISTANCE_VECTOR		= 5, +	VE_FRACTION			= 6, +	VE_MAXIMUM			= 7, +	VE_MINIMUM			= 8, +	VE_SET_GREATER_THAN_EQUAL	= 9, +	VE_SET_LESS_THAN		= 10, +	VE_MULTIPLYX2_ADD		= 11, +	VE_MULTIPLY_CLAMP		= 12, +	VE_FLT2FIX_DX			= 13, +	VE_FLT2FIX_DX_RND		= 14, +	/* R5XX */ +	VE_PRED_SET_EQ_PUSH		= 15, +	VE_PRED_SET_GT_PUSH		= 16, +	VE_PRED_SET_GTE_PUSH		= 17, +	VE_PRED_SET_NEQ_PUSH		= 18, +	VE_COND_WRITE_EQ		= 19, +	VE_COND_WRITE_GT		= 20, +	VE_COND_WRITE_GTE		= 21, +	VE_COND_WRITE_NEQ		= 22, +	VE_COND_MUX_EQ			= 23, +	VE_COND_MUX_GT			= 24, +	VE_COND_MUX_GTE			= 25, +	VE_SET_GREATER_THAN		= 26, +	VE_SET_EQUAL			= 27, +	VE_SET_NOT_EQUAL		= 28, +}; + +enum { +	/* R3XX */ +	MATH_NO_OP			= 0, +	ME_EXP_BASE2_DX			= 1, +	ME_LOG_BASE2_DX			= 2, +	ME_EXP_BASEE_FF			= 3, +	ME_LIGHT_COEFF_DX		= 4, +	ME_POWER_FUNC_FF		= 5, +	ME_RECIP_DX			= 6, +	ME_RECIP_FF			= 7, +	ME_RECIP_SQRT_DX		= 8, +	ME_RECIP_SQRT_FF		= 9, +	ME_MULTIPLY			= 10, +	ME_EXP_BASE2_FULL_DX		= 11, +	ME_LOG_BASE2_FULL_DX		= 12, +	ME_POWER_FUNC_FF_CLAMP_B	= 13, +	ME_POWER_FUNC_FF_CLAMP_B1	= 14, +	ME_POWER_FUNC_FF_CLAMP_01	= 15, +	ME_SIN				= 16, +	ME_COS				= 17, +	/* R5XX */ +	ME_LOG_BASE2_IEEE		= 18, +	ME_RECIP_IEEE			= 19, +	ME_RECIP_SQRT_IEEE		= 20, +	ME_PRED_SET_EQ			= 21, +	ME_PRED_SET_GT			= 22, +	ME_PRED_SET_GTE			= 23, +	ME_PRED_SET_NEQ			= 24, +	ME_PRED_SET_CLR			= 25, +	ME_PRED_SET_INV			= 26, +	ME_PRED_SET_POP			= 27, +	ME_PRED_SET_RESTORE		= 28, +}; + +enum { +	/* R3XX */ +	PVS_MACRO_OP_2CLK_MADD		= 0, +	PVS_MACRO_OP_2CLK_M2X_ADD	= 1, +}; + +enum { +	PVS_SRC_REG_TEMPORARY		= 0,	/* Intermediate Storage */ +	PVS_SRC_REG_INPUT		= 1,	/* Input Vertex Storage */ +	PVS_SRC_REG_CONSTANT		= 2,	/* Constant State Storage */ +	PVS_SRC_REG_ALT_TEMPORARY	= 3,	/* Alternate Intermediate Storage */ +}; + +enum { +	PVS_DST_REG_TEMPORARY		= 0,	/* Intermediate Storage */ +	PVS_DST_REG_A0			= 1,	/* Address Register Storage */ +	PVS_DST_REG_OUT			= 2,	/* Output Memory. 
Used for all outputs */ +	PVS_DST_REG_OUT_REPL_X		= 3,	/* Output Memory & Replicate X to all channels */ +	PVS_DST_REG_ALT_TEMPORARY	= 4,	/* Alternate Intermediate Storage */ +	PVS_DST_REG_INPUT		= 5,	/* Output Memory & Replicate X to all channels */ +}; + +enum { +	PVS_SRC_SELECT_X		= 0,	/* Select X Component */ +	PVS_SRC_SELECT_Y		= 1,	/* Select Y Component */ +	PVS_SRC_SELECT_Z		= 2,	/* Select Z Component */ +	PVS_SRC_SELECT_W		= 3,	/* Select W Component */ +	PVS_SRC_SELECT_FORCE_0		= 4,	/* Force Component to 0.0 */ +	PVS_SRC_SELECT_FORCE_1		= 5,	/* Force Component to 1.0 */ +}; + +/* PVS Opcode & Destination Operand Description */ + +enum { +	PVS_DST_OPCODE_MASK		= 0x3f, +	PVS_DST_OPCODE_SHIFT		= 0, +	PVS_DST_MATH_INST_MASK		= 0x1, +	PVS_DST_MATH_INST_SHIFT		= 6, +	PVS_DST_MACRO_INST_MASK		= 0x1, +	PVS_DST_MACRO_INST_SHIFT	= 7, +	PVS_DST_REG_TYPE_MASK		= 0xf, +	PVS_DST_REG_TYPE_SHIFT		= 8, +	PVS_DST_ADDR_MODE_1_MASK	= 0x1, +	PVS_DST_ADDR_MODE_1_SHIFT	= 12, +	PVS_DST_OFFSET_MASK		= 0x7f, +	PVS_DST_OFFSET_SHIFT		= 13, +	PVS_DST_WE_X_MASK		= 0x1, +	PVS_DST_WE_X_SHIFT		= 20, +	PVS_DST_WE_Y_MASK		= 0x1, +	PVS_DST_WE_Y_SHIFT		= 21, +	PVS_DST_WE_Z_MASK		= 0x1, +	PVS_DST_WE_Z_SHIFT		= 22, +	PVS_DST_WE_W_MASK		= 0x1, +	PVS_DST_WE_W_SHIFT		= 23, +	PVS_DST_VE_SAT_MASK		= 0x1, +	PVS_DST_VE_SAT_SHIFT		= 24, +	PVS_DST_ME_SAT_MASK		= 0x1, +	PVS_DST_ME_SAT_SHIFT		= 25, +	PVS_DST_PRED_ENABLE_MASK	= 0x1, +	PVS_DST_PRED_ENABLE_SHIFT	= 26, +	PVS_DST_PRED_SENSE_MASK		= 0x1, +	PVS_DST_PRED_SENSE_SHIFT	= 27, +	PVS_DST_DUAL_MATH_OP_MASK	= 0x3, +	PVS_DST_DUAL_MATH_OP_SHIFT	= 27, +	PVS_DST_ADDR_SEL_MASK		= 0x3, +	PVS_DST_ADDR_SEL_SHIFT		= 29, +	PVS_DST_ADDR_MODE_0_MASK	= 0x1, +	PVS_DST_ADDR_MODE_0_SHIFT	= 31, +}; + +/* PVS Source Operand Description */ + +enum { +	PVS_SRC_REG_TYPE_MASK		= 0x3, +	PVS_SRC_REG_TYPE_SHIFT		= 0, +	SPARE_0_MASK			= 0x1, +	SPARE_0_SHIFT			= 2, +	PVS_SRC_ABS_XYZW_MASK		= 0x1, +	PVS_SRC_ABS_XYZW_SHIFT		= 3, +	PVS_SRC_ADDR_MODE_0_MASK	= 0x1, +	PVS_SRC_ADDR_MODE_0_SHIFT	= 4, +	PVS_SRC_OFFSET_MASK		= 0xff, +	PVS_SRC_OFFSET_SHIFT		= 5, +	PVS_SRC_SWIZZLE_X_MASK		= 0x7, +	PVS_SRC_SWIZZLE_X_SHIFT		= 13, +	PVS_SRC_SWIZZLE_Y_MASK		= 0x7, +	PVS_SRC_SWIZZLE_Y_SHIFT		= 16, +	PVS_SRC_SWIZZLE_Z_MASK		= 0x7, +	PVS_SRC_SWIZZLE_Z_SHIFT		= 19, +	PVS_SRC_SWIZZLE_W_MASK		= 0x7, +	PVS_SRC_SWIZZLE_W_SHIFT		= 22, +	PVS_SRC_MODIFIER_X_MASK		= 0x1, +	PVS_SRC_MODIFIER_X_SHIFT	= 25, +	PVS_SRC_MODIFIER_Y_MASK		= 0x1, +	PVS_SRC_MODIFIER_Y_SHIFT	= 26, +	PVS_SRC_MODIFIER_Z_MASK		= 0x1, +	PVS_SRC_MODIFIER_Z_SHIFT	= 27, +	PVS_SRC_MODIFIER_W_MASK		= 0x1, +	PVS_SRC_MODIFIER_W_SHIFT	= 28, +	PVS_SRC_ADDR_SEL_MASK		= 0x3, +	PVS_SRC_ADDR_SEL_SHIFT		= 29, +	PVS_SRC_ADDR_MODE_1_MASK	= 0x0, +	PVS_SRC_ADDR_MODE_1_SHIFT	= 32, +}; + +/*\}*/ + +/* BEGIN: Packet 3 commands */ + +/* A primitive emission dword. 
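+ * It is assembled by OR-ing together the R300_PRIM_* values defined below.
+ * As a sketch (not taken from this header), an indexed triangle-list draw of
+ * num_verts vertices could be described as:
+ *
+ *   uint32_t vap_vf_cntl = R300_PRIM_TYPE_TRI_LIST |
+ *                          R300_PRIM_WALK_IND |
+ *                          (num_verts << R300_PRIM_NUM_VERTICES_SHIFT);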
*/ +#define R300_PRIM_TYPE_NONE                     (0 << 0) +#define R300_PRIM_TYPE_POINT                    (1 << 0) +#define R300_PRIM_TYPE_LINE                     (2 << 0) +#define R300_PRIM_TYPE_LINE_STRIP               (3 << 0) +#define R300_PRIM_TYPE_TRI_LIST                 (4 << 0) +#define R300_PRIM_TYPE_TRI_FAN                  (5 << 0) +#define R300_PRIM_TYPE_TRI_STRIP                (6 << 0) +#define R300_PRIM_TYPE_TRI_TYPE2                (7 << 0) +#define R300_PRIM_TYPE_RECT_LIST                (8 << 0) +#define R300_PRIM_TYPE_3VRT_POINT_LIST          (9 << 0) +#define R300_PRIM_TYPE_3VRT_LINE_LIST           (10 << 0) +	/* GUESS (based on r200) */ +#define R300_PRIM_TYPE_POINT_SPRITES            (11 << 0) +#define R300_PRIM_TYPE_LINE_LOOP                (12 << 0) +#define R300_PRIM_TYPE_QUADS                    (13 << 0) +#define R300_PRIM_TYPE_QUAD_STRIP               (14 << 0) +#define R300_PRIM_TYPE_POLYGON                  (15 << 0) +#define R300_PRIM_TYPE_MASK                     0xF +#define R300_PRIM_WALK_IND                      (1 << 4) +#define R300_PRIM_WALK_LIST                     (2 << 4) +#define R300_PRIM_WALK_RING                     (3 << 4) +#define R300_PRIM_WALK_MASK                     (3 << 4) +	/* GUESS (based on r200) */ +#define R300_PRIM_COLOR_ORDER_BGRA              (0 << 6) +#define R300_PRIM_COLOR_ORDER_RGBA              (1 << 6) +#define R300_PRIM_NUM_VERTICES_SHIFT            16 +#define R300_PRIM_NUM_VERTICES_MASK             0xffff + + + +/* + * The R500 unified shader (US) registers come in banks of 512 each, one + * for each instruction slot in the shader.  You can't touch them directly. + * R500_US_VECTOR_INDEX() sets the base instruction to modify; successive + * writes to R500_GA_US_VECTOR_DATA autoincrement the index after the + * instruction is fully specified. 
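+ *
+ * As a sketch of the resulting upload sequence (illustrative only: OUT_REG()
+ * is a hypothetical register-write helper, first_slot the starting
+ * instruction slot, and code[] an array of dwords already packed from the
+ * R500_US_* fields defined below):
+ *
+ *   OUT_REG(R500_US_VECTOR_INDEX, first_slot);
+ *   for (i = 0; i < num_dwords; i++)
+ *       OUT_REG(R500_GA_US_VECTOR_DATA, code[i]);   /* index autoincrements */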
+ */ +#define R500_US_ALU_ALPHA_INST_0			0xa800 +#   define R500_ALPHA_OP_MAD				0 +#   define R500_ALPHA_OP_DP				1 +#   define R500_ALPHA_OP_MIN				2 +#   define R500_ALPHA_OP_MAX				3 +/* #define R500_ALPHA_OP_RESERVED			4 */ +#   define R500_ALPHA_OP_CND				5 +#   define R500_ALPHA_OP_CMP				6 +#   define R500_ALPHA_OP_FRC				7 +#   define R500_ALPHA_OP_EX2				8 +#   define R500_ALPHA_OP_LN2				9 +#   define R500_ALPHA_OP_RCP				10 +#   define R500_ALPHA_OP_RSQ				11 +#   define R500_ALPHA_OP_SIN				12 +#   define R500_ALPHA_OP_COS				13 +#   define R500_ALPHA_OP_MDH				14 +#   define R500_ALPHA_OP_MDV				15 +#   define R500_ALPHA_ADDRD(x)				(x << 4) +#   define R500_ALPHA_ADDRD_REL				(1 << 11) +#  define R500_ALPHA_SEL_A_SHIFT			12 +#   define R500_ALPHA_SEL_A_SRC0			(0 << 12) +#   define R500_ALPHA_SEL_A_SRC1			(1 << 12) +#   define R500_ALPHA_SEL_A_SRC2			(2 << 12) +#   define R500_ALPHA_SEL_A_SRCP			(3 << 12) +#   define R500_ALPHA_SWIZ_A_R				(0 << 14) +#   define R500_ALPHA_SWIZ_A_G				(1 << 14) +#   define R500_ALPHA_SWIZ_A_B				(2 << 14) +#   define R500_ALPHA_SWIZ_A_A				(3 << 14) +#   define R500_ALPHA_SWIZ_A_0				(4 << 14) +#   define R500_ALPHA_SWIZ_A_HALF			(5 << 14) +#   define R500_ALPHA_SWIZ_A_1				(6 << 14) +/* #define R500_ALPHA_SWIZ_A_UNUSED			(7 << 14) */ +#   define R500_ALPHA_MOD_A_NOP				(0 << 17) +#   define R500_ALPHA_MOD_A_NEG				(1 << 17) +#   define R500_ALPHA_MOD_A_ABS				(2 << 17) +#   define R500_ALPHA_MOD_A_NAB				(3 << 17) +#  define R500_ALPHA_SEL_B_SHIFT			19 +#   define R500_ALPHA_SEL_B_SRC0			(0 << 19) +#   define R500_ALPHA_SEL_B_SRC1			(1 << 19) +#   define R500_ALPHA_SEL_B_SRC2			(2 << 19) +#   define R500_ALPHA_SEL_B_SRCP			(3 << 19) +#   define R500_ALPHA_SWIZ_B_R				(0 << 21) +#   define R500_ALPHA_SWIZ_B_G				(1 << 21) +#   define R500_ALPHA_SWIZ_B_B				(2 << 21) +#   define R500_ALPHA_SWIZ_B_A				(3 << 21) +#   define R500_ALPHA_SWIZ_B_0				(4 << 21) +#   define R500_ALPHA_SWIZ_B_HALF			(5 << 21) +#   define R500_ALPHA_SWIZ_B_1				(6 << 21) +/* #define R500_ALPHA_SWIZ_B_UNUSED			(7 << 21) */ +#   define R500_ALPHA_MOD_B_NOP				(0 << 24) +#   define R500_ALPHA_MOD_B_NEG				(1 << 24) +#   define R500_ALPHA_MOD_B_ABS				(2 << 24) +#   define R500_ALPHA_MOD_B_NAB				(3 << 24) +#   define R500_ALPHA_OMOD_IDENTITY			(0 << 26) +#   define R500_ALPHA_OMOD_MUL_2			(1 << 26) +#   define R500_ALPHA_OMOD_MUL_4			(2 << 26) +#   define R500_ALPHA_OMOD_MUL_8			(3 << 26) +#   define R500_ALPHA_OMOD_DIV_2			(4 << 26) +#   define R500_ALPHA_OMOD_DIV_4			(5 << 26) +#   define R500_ALPHA_OMOD_DIV_8			(6 << 26) +#   define R500_ALPHA_OMOD_DISABLE			(7 << 26) +#   define R500_ALPHA_TARGET(x)				(x << 29) +#   define R500_ALPHA_W_OMASK				(1 << 31) +#define R500_US_ALU_ALPHA_ADDR_0			0x9800 +#   define R500_ALPHA_ADDR0(x)				(x << 0) +#   define R500_ALPHA_ADDR0_CONST			(1 << 8) +#   define R500_ALPHA_ADDR0_REL				(1 << 9) +#   define R500_ALPHA_ADDR1(x)				(x << 10) +#   define R500_ALPHA_ADDR1_CONST			(1 << 18) +#   define R500_ALPHA_ADDR1_REL				(1 << 19) +#   define R500_ALPHA_ADDR2(x)				(x << 20) +#   define R500_ALPHA_ADDR2_CONST			(1 << 28) +#   define R500_ALPHA_ADDR2_REL				(1 << 29) +#   define R500_ALPHA_SRCP_OP_1_MINUS_2A0		(0 << 30) +#   define R500_ALPHA_SRCP_OP_A1_MINUS_A0		(1 << 30) +#   define R500_ALPHA_SRCP_OP_A1_PLUS_A0		(2 << 30) +#   define R500_ALPHA_SRCP_OP_1_MINUS_A0		(3 << 30) +#define R500_US_ALU_RGBA_INST_0				0xb000 +#   define R500_ALU_RGBA_OP_MAD				(0 << 0) +#   define R500_ALU_RGBA_OP_DP3				(1 << 0) +#   define 
R500_ALU_RGBA_OP_DP4				(2 << 0) +#   define R500_ALU_RGBA_OP_D2A				(3 << 0) +#   define R500_ALU_RGBA_OP_MIN				(4 << 0) +#   define R500_ALU_RGBA_OP_MAX				(5 << 0) +/* #define R500_ALU_RGBA_OP_RESERVED			(6 << 0) */ +#   define R500_ALU_RGBA_OP_CND				(7 << 0) +#   define R500_ALU_RGBA_OP_CMP				(8 << 0) +#   define R500_ALU_RGBA_OP_FRC				(9 << 0) +#   define R500_ALU_RGBA_OP_SOP				(10 << 0) +#   define R500_ALU_RGBA_OP_MDH				(11 << 0) +#   define R500_ALU_RGBA_OP_MDV				(12 << 0) +#   define R500_ALU_RGBA_ADDRD(x)			(x << 4) +#   define R500_ALU_RGBA_ADDRD_REL			(1 << 11) +#  define R500_ALU_RGBA_SEL_C_SHIFT			12 +#   define R500_ALU_RGBA_SEL_C_SRC0			(0 << 12) +#   define R500_ALU_RGBA_SEL_C_SRC1			(1 << 12) +#   define R500_ALU_RGBA_SEL_C_SRC2			(2 << 12) +#   define R500_ALU_RGBA_SEL_C_SRCP			(3 << 12) +#   define R500_ALU_RGBA_R_SWIZ_R			(0 << 14) +#   define R500_ALU_RGBA_R_SWIZ_G			(1 << 14) +#   define R500_ALU_RGBA_R_SWIZ_B			(2 << 14) +#   define R500_ALU_RGBA_R_SWIZ_A			(3 << 14) +#   define R500_ALU_RGBA_R_SWIZ_0			(4 << 14) +#   define R500_ALU_RGBA_R_SWIZ_HALF			(5 << 14) +#   define R500_ALU_RGBA_R_SWIZ_1			(6 << 14) +/* #define R500_ALU_RGBA_R_SWIZ_UNUSED			(7 << 14) */ +#   define R500_ALU_RGBA_G_SWIZ_R			(0 << 17) +#   define R500_ALU_RGBA_G_SWIZ_G			(1 << 17) +#   define R500_ALU_RGBA_G_SWIZ_B			(2 << 17) +#   define R500_ALU_RGBA_G_SWIZ_A			(3 << 17) +#   define R500_ALU_RGBA_G_SWIZ_0			(4 << 17) +#   define R500_ALU_RGBA_G_SWIZ_HALF			(5 << 17) +#   define R500_ALU_RGBA_G_SWIZ_1			(6 << 17) +/* #define R500_ALU_RGBA_G_SWIZ_UNUSED			(7 << 17) */ +#   define R500_ALU_RGBA_B_SWIZ_R			(0 << 20) +#   define R500_ALU_RGBA_B_SWIZ_G			(1 << 20) +#   define R500_ALU_RGBA_B_SWIZ_B			(2 << 20) +#   define R500_ALU_RGBA_B_SWIZ_A			(3 << 20) +#   define R500_ALU_RGBA_B_SWIZ_0			(4 << 20) +#   define R500_ALU_RGBA_B_SWIZ_HALF			(5 << 20) +#   define R500_ALU_RGBA_B_SWIZ_1			(6 << 20) +/* #define R500_ALU_RGBA_B_SWIZ_UNUSED			(7 << 20) */ +#   define R500_ALU_RGBA_MOD_C_NOP			(0 << 23) +#   define R500_ALU_RGBA_MOD_C_NEG			(1 << 23) +#   define R500_ALU_RGBA_MOD_C_ABS			(2 << 23) +#   define R500_ALU_RGBA_MOD_C_NAB			(3 << 23) +#  define R500_ALU_RGBA_ALPHA_SEL_C_SHIFT		25 +#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC0		(0 << 25) +#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC1		(1 << 25) +#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC2		(2 << 25) +#   define R500_ALU_RGBA_ALPHA_SEL_C_SRCP		(3 << 25) +#   define R500_ALU_RGBA_A_SWIZ_R			(0 << 27) +#   define R500_ALU_RGBA_A_SWIZ_G			(1 << 27) +#   define R500_ALU_RGBA_A_SWIZ_B			(2 << 27) +#   define R500_ALU_RGBA_A_SWIZ_A			(3 << 27) +#   define R500_ALU_RGBA_A_SWIZ_0			(4 << 27) +#   define R500_ALU_RGBA_A_SWIZ_HALF			(5 << 27) +#   define R500_ALU_RGBA_A_SWIZ_1			(6 << 27) +/* #define R500_ALU_RGBA_A_SWIZ_UNUSED			(7 << 27) */ +#   define R500_ALU_RGBA_ALPHA_MOD_C_NOP		(0 << 30) +#   define R500_ALU_RGBA_ALPHA_MOD_C_NEG		(1 << 30) +#   define R500_ALU_RGBA_ALPHA_MOD_C_ABS		(2 << 30) +#   define R500_ALU_RGBA_ALPHA_MOD_C_NAB		(3 << 30) +#define R500_US_ALU_RGB_INST_0				0xa000 +#  define R500_ALU_RGB_SEL_A_SHIFT			0 +#   define R500_ALU_RGB_SEL_A_SRC0			(0 << 0) +#   define R500_ALU_RGB_SEL_A_SRC1			(1 << 0) +#   define R500_ALU_RGB_SEL_A_SRC2			(2 << 0) +#   define R500_ALU_RGB_SEL_A_SRCP			(3 << 0) +#   define R500_ALU_RGB_R_SWIZ_A_R			(0 << 2) +#   define R500_ALU_RGB_R_SWIZ_A_G			(1 << 2) +#   define R500_ALU_RGB_R_SWIZ_A_B			(2 << 2) +#   define R500_ALU_RGB_R_SWIZ_A_A			(3 << 2) +#   define R500_ALU_RGB_R_SWIZ_A_0			(4 << 2) +#   
define R500_ALU_RGB_R_SWIZ_A_HALF			(5 << 2) +#   define R500_ALU_RGB_R_SWIZ_A_1			(6 << 2) +/* #define R500_ALU_RGB_R_SWIZ_A_UNUSED			(7 << 2) */ +#   define R500_ALU_RGB_G_SWIZ_A_R			(0 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_G			(1 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_B			(2 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_A			(3 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_0			(4 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_HALF			(5 << 5) +#   define R500_ALU_RGB_G_SWIZ_A_1			(6 << 5) +/* #define R500_ALU_RGB_G_SWIZ_A_UNUSED			(7 << 5) */ +#   define R500_ALU_RGB_B_SWIZ_A_R			(0 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_G			(1 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_B			(2 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_A			(3 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_0			(4 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_HALF			(5 << 8) +#   define R500_ALU_RGB_B_SWIZ_A_1			(6 << 8) +/* #define R500_ALU_RGB_B_SWIZ_A_UNUSED			(7 << 8) */ +#   define R500_ALU_RGB_MOD_A_NOP			(0 << 11) +#   define R500_ALU_RGB_MOD_A_NEG			(1 << 11) +#   define R500_ALU_RGB_MOD_A_ABS			(2 << 11) +#   define R500_ALU_RGB_MOD_A_NAB			(3 << 11) +#  define R500_ALU_RGB_SEL_B_SHIFT			13 +#   define R500_ALU_RGB_SEL_B_SRC0			(0 << 13) +#   define R500_ALU_RGB_SEL_B_SRC1			(1 << 13) +#   define R500_ALU_RGB_SEL_B_SRC2			(2 << 13) +#   define R500_ALU_RGB_SEL_B_SRCP			(3 << 13) +#   define R500_ALU_RGB_R_SWIZ_B_R			(0 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_G			(1 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_B			(2 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_A			(3 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_0			(4 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_HALF			(5 << 15) +#   define R500_ALU_RGB_R_SWIZ_B_1			(6 << 15) +/* #define R500_ALU_RGB_R_SWIZ_B_UNUSED			(7 << 15) */ +#   define R500_ALU_RGB_G_SWIZ_B_R			(0 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_G			(1 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_B			(2 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_A			(3 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_0			(4 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_HALF			(5 << 18) +#   define R500_ALU_RGB_G_SWIZ_B_1			(6 << 18) +/* #define R500_ALU_RGB_G_SWIZ_B_UNUSED			(7 << 18) */ +#   define R500_ALU_RGB_B_SWIZ_B_R			(0 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_G			(1 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_B			(2 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_A			(3 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_0			(4 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_HALF			(5 << 21) +#   define R500_ALU_RGB_B_SWIZ_B_1			(6 << 21) +/* #define R500_ALU_RGB_B_SWIZ_B_UNUSED			(7 << 21) */ +#   define R500_ALU_RGB_MOD_B_NOP			(0 << 24) +#   define R500_ALU_RGB_MOD_B_NEG			(1 << 24) +#   define R500_ALU_RGB_MOD_B_ABS			(2 << 24) +#   define R500_ALU_RGB_MOD_B_NAB			(3 << 24) +#   define R500_ALU_RGB_OMOD_IDENTITY			(0 << 26) +#   define R500_ALU_RGB_OMOD_MUL_2			(1 << 26) +#   define R500_ALU_RGB_OMOD_MUL_4			(2 << 26) +#   define R500_ALU_RGB_OMOD_MUL_8			(3 << 26) +#   define R500_ALU_RGB_OMOD_DIV_2			(4 << 26) +#   define R500_ALU_RGB_OMOD_DIV_4			(5 << 26) +#   define R500_ALU_RGB_OMOD_DIV_8			(6 << 26) +#   define R500_ALU_RGB_OMOD_DISABLE			(7 << 26) +#   define R500_ALU_RGB_TARGET(x)			(x << 29) +#   define R500_ALU_RGB_WMASK				(1 << 31) +#define R500_US_ALU_RGB_ADDR_0				0x9000 +#   define R500_RGB_ADDR0(x)				(x << 0) +#   define R500_RGB_ADDR0_CONST				(1 << 8) +#   define R500_RGB_ADDR0_REL				(1 << 9) +#   define R500_RGB_ADDR1(x)				(x << 10) +#   define R500_RGB_ADDR1_CONST				(1 << 18) +#   define R500_RGB_ADDR1_REL				(1 << 19) +#   define R500_RGB_ADDR2(x)				(x << 20) +#   define 
R500_RGB_ADDR2_CONST				(1 << 28) +#   define R500_RGB_ADDR2_REL				(1 << 29) +#   define R500_RGB_SRCP_OP_1_MINUS_2RGB0		(0 << 30) +#   define R500_RGB_SRCP_OP_RGB1_MINUS_RGB0		(1 << 30) +#   define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0		(2 << 30) +#   define R500_RGB_SRCP_OP_1_MINUS_RGB0		(3 << 30) +#define R500_US_CMN_INST_0				0xb800 +#  define R500_INST_TYPE_MASK				(3 << 0) +#   define R500_INST_TYPE_ALU				(0 << 0) +#   define R500_INST_TYPE_OUT				(1 << 0) +#   define R500_INST_TYPE_FC				(2 << 0) +#   define R500_INST_TYPE_TEX				(3 << 0) +#   define R500_INST_TEX_SEM_WAIT			(1 << 2) +#   define R500_INST_RGB_PRED_SEL_NONE			(0 << 3) +#   define R500_INST_RGB_PRED_SEL_RGBA			(1 << 3) +#   define R500_INST_RGB_PRED_SEL_RRRR			(2 << 3) +#   define R500_INST_RGB_PRED_SEL_GGGG			(3 << 3) +#   define R500_INST_RGB_PRED_SEL_BBBB			(4 << 3) +#   define R500_INST_RGB_PRED_SEL_AAAA			(5 << 3) +#   define R500_INST_RGB_PRED_INV			(1 << 6) +#   define R500_INST_WRITE_INACTIVE			(1 << 7) +#   define R500_INST_LAST				(1 << 8) +#   define R500_INST_NOP				(1 << 9) +#   define R500_INST_ALU_WAIT				(1 << 10) +#   define R500_INST_RGB_WMASK_R			(1 << 11) +#   define R500_INST_RGB_WMASK_G			(1 << 12) +#   define R500_INST_RGB_WMASK_B			(1 << 13) +#   define R500_INST_ALPHA_WMASK			(1 << 14) +#   define R500_INST_RGB_OMASK_R			(1 << 15) +#   define R500_INST_RGB_OMASK_G			(1 << 16) +#   define R500_INST_RGB_OMASK_B			(1 << 17) +#   define R500_INST_ALPHA_OMASK			(1 << 18) +#   define R500_INST_RGB_CLAMP				(1 << 19) +#   define R500_INST_ALPHA_CLAMP			(1 << 20) +#   define R500_INST_ALU_RESULT_SEL			(1 << 21) +#   define R500_INST_ALPHA_PRED_INV			(1 << 22) +#   define R500_INST_ALU_RESULT_OP_EQ			(0 << 23) +#   define R500_INST_ALU_RESULT_OP_LT			(1 << 23) +#   define R500_INST_ALU_RESULT_OP_GE			(2 << 23) +#   define R500_INST_ALU_RESULT_OP_NE			(3 << 23) +#   define R500_INST_ALPHA_PRED_SEL_NONE		(0 << 25) +#   define R500_INST_ALPHA_PRED_SEL_RGBA		(1 << 25) +#   define R500_INST_ALPHA_PRED_SEL_RRRR		(2 << 25) +#   define R500_INST_ALPHA_PRED_SEL_GGGG		(3 << 25) +#   define R500_INST_ALPHA_PRED_SEL_BBBB		(4 << 25) +#   define R500_INST_ALPHA_PRED_SEL_AAAA		(5 << 25) +/* XXX next four are kind of guessed */ +#   define R500_INST_STAT_WE_R				(1 << 28) +#   define R500_INST_STAT_WE_G				(1 << 29) +#   define R500_INST_STAT_WE_B				(1 << 30) +#   define R500_INST_STAT_WE_A				(1 << 31) + +/* note that these are 8 bit lengths, despite the offsets, at least for R500 */ +#define R500_US_CODE_ADDR				0x4630 +#   define R500_US_CODE_START_ADDR(x)			(x << 0) +#   define R500_US_CODE_END_ADDR(x)			(x << 16) +#define R500_US_CODE_OFFSET				0x4638 +#   define R500_US_CODE_OFFSET_ADDR(x)			(x << 0) +#define R500_US_CODE_RANGE				0x4634 +#   define R500_US_CODE_RANGE_ADDR(x)			(x << 0) +#   define R500_US_CODE_RANGE_SIZE(x)			(x << 16) +#define R500_US_CONFIG					0x4600 +#   define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO		(1 << 1) +#define R500_US_FC_ADDR_0				0xa000 +#   define R500_FC_BOOL_ADDR(x)				(x << 0) +#   define R500_FC_INT_ADDR(x)				(x << 8) +#   define R500_FC_JUMP_ADDR(x)				(x << 16) +#   define R500_FC_JUMP_GLOBAL				(1 << 31) +#define R500_US_FC_BOOL_CONST				0x4620 +#   define R500_FC_KBOOL(x)				(x) +#define R500_US_FC_CTRL					0x4624 +#   define R500_FC_TEST_EN				(1 << 30) +#   define R500_FC_FULL_FC_EN				(1 << 31) +#define R500_US_FC_INST_0				0x9800 +#   define R500_FC_OP_JUMP				(0 << 0) +#   define R500_FC_OP_LOOP				(1 << 0) +#   define R500_FC_OP_ENDLOOP				(2 << 0) +#   define 
R500_FC_OP_REP				(3 << 0) +#   define R500_FC_OP_ENDREP				(4 << 0) +#   define R500_FC_OP_BREAKLOOP				(5 << 0) +#   define R500_FC_OP_BREAKREP				(6 << 0) +#   define R500_FC_OP_CONTINUE				(7 << 0) +#   define R500_FC_B_ELSE				(1 << 4) +#   define R500_FC_JUMP_ANY				(1 << 5) +#   define R500_FC_A_OP_NONE				(0 << 6) +#   define R500_FC_A_OP_POP				(1 << 6) +#   define R500_FC_A_OP_PUSH				(2 << 6) +#   define R500_FC_JUMP_FUNC(x)				(x << 8) +#   define R500_FC_B_POP_CNT(x)				(x << 16) +#   define R500_FC_B_OP0_NONE				(0 << 24) +#   define R500_FC_B_OP0_DECR				(1 << 24) +#   define R500_FC_B_OP0_INCR				(2 << 24) +#   define R500_FC_B_OP1_DECR				(0 << 26) +#   define R500_FC_B_OP1_NONE				(1 << 26) +#   define R500_FC_B_OP1_INCR				(2 << 26) +#   define R500_FC_IGNORE_UNCOVERED			(1 << 28) +#define R500_US_FC_INT_CONST_0				0x4c00 +#   define R500_FC_INT_CONST_KR(x)			(x << 0) +#   define R500_FC_INT_CONST_KG(x)			(x << 8) +#   define R500_FC_INT_CONST_KB(x)			(x << 16) +/* _0 through _15 */ +#define R500_US_FORMAT0_0				0x4640 +#   define R500_FORMAT_TXWIDTH(x)			(x << 0) +#   define R500_FORMAT_TXHEIGHT(x)			(x << 11) +#   define R500_FORMAT_TXDEPTH(x)			(x << 22) +/* _0 through _3 */ +#define R500_US_OUT_FMT_0				0x46A4 +#   define R500_OUT_FMT_C4_8				(0 << 0) +#   define R500_OUT_FMT_C4_10				(1 << 0) +#   define R500_OUT_FMT_C4_10_GAMMA			(2 << 0) +#   define R500_OUT_FMT_C_16				(3 << 0) +#   define R500_OUT_FMT_C2_16				(4 << 0) +#   define R500_OUT_FMT_C4_16				(5 << 0) +#   define R500_OUT_FMT_C_16_MPEG			(6 << 0) +#   define R500_OUT_FMT_C2_16_MPEG			(7 << 0) +#   define R500_OUT_FMT_C2_4				(8 << 0) +#   define R500_OUT_FMT_C_3_3_2				(9 << 0) +#   define R500_OUT_FMT_C_6_5_6				(10 << 0) +#   define R500_OUT_FMT_C_11_11_10			(11 << 0) +#   define R500_OUT_FMT_C_10_11_11			(12 << 0) +#   define R500_OUT_FMT_C_2_10_10_10			(13 << 0) +/* #define R500_OUT_FMT_RESERVED			(14 << 0) */ +#   define R500_OUT_FMT_UNUSED				(15 << 0) +#   define R500_OUT_FMT_C_16_FP				(16 << 0) +#   define R500_OUT_FMT_C2_16_FP			(17 << 0) +#   define R500_OUT_FMT_C4_16_FP			(18 << 0) +#   define R500_OUT_FMT_C_32_FP				(19 << 0) +#   define R500_OUT_FMT_C2_32_FP			(20 << 0) +#   define R500_OUT_FMT_C4_32_FP			(21 << 0) +#   define R500_C0_SEL_A				(0 << 8) +#   define R500_C0_SEL_R				(1 << 8) +#   define R500_C0_SEL_G				(2 << 8) +#   define R500_C0_SEL_B				(3 << 8) +#   define R500_C1_SEL_A				(0 << 10) +#   define R500_C1_SEL_R				(1 << 10) +#   define R500_C1_SEL_G				(2 << 10) +#   define R500_C1_SEL_B				(3 << 10) +#   define R500_C2_SEL_A				(0 << 12) +#   define R500_C2_SEL_R				(1 << 12) +#   define R500_C2_SEL_G				(2 << 12) +#   define R500_C2_SEL_B				(3 << 12) +#   define R500_C3_SEL_A				(0 << 14) +#   define R500_C3_SEL_R				(1 << 14) +#   define R500_C3_SEL_G				(2 << 14) +#   define R500_C3_SEL_B				(3 << 14) +#   define R500_OUT_SIGN(x)				(x << 16) +#   define R500_ROUND_ADJ				(1 << 20) +#define R500_US_PIXSIZE					0x4604 +#   define R500_PIX_SIZE(x)				(x) +#define R500_US_TEX_ADDR_0				0x9800 +#   define R500_TEX_SRC_ADDR(x)				(x << 0) +#   define R500_TEX_SRC_ADDR_REL			(1 << 7) +#   define R500_TEX_SRC_S_SWIZ_R			(0 << 8) +#   define R500_TEX_SRC_S_SWIZ_G			(1 << 8) +#   define R500_TEX_SRC_S_SWIZ_B			(2 << 8) +#   define R500_TEX_SRC_S_SWIZ_A			(3 << 8) +#   define R500_TEX_SRC_T_SWIZ_R			(0 << 10) +#   define R500_TEX_SRC_T_SWIZ_G			(1 << 10) +#   define R500_TEX_SRC_T_SWIZ_B			(2 << 10) +#   define R500_TEX_SRC_T_SWIZ_A			(3 << 10) +#   define 
R500_TEX_SRC_R_SWIZ_R			(0 << 12) +#   define R500_TEX_SRC_R_SWIZ_G			(1 << 12) +#   define R500_TEX_SRC_R_SWIZ_B			(2 << 12) +#   define R500_TEX_SRC_R_SWIZ_A			(3 << 12) +#   define R500_TEX_SRC_Q_SWIZ_R			(0 << 14) +#   define R500_TEX_SRC_Q_SWIZ_G			(1 << 14) +#   define R500_TEX_SRC_Q_SWIZ_B			(2 << 14) +#   define R500_TEX_SRC_Q_SWIZ_A			(3 << 14) +#   define R500_TEX_DST_ADDR(x)				(x << 16) +#   define R500_TEX_DST_ADDR_REL			(1 << 23) +#   define R500_TEX_DST_R_SWIZ_R			(0 << 24) +#   define R500_TEX_DST_R_SWIZ_G			(1 << 24) +#   define R500_TEX_DST_R_SWIZ_B			(2 << 24) +#   define R500_TEX_DST_R_SWIZ_A			(3 << 24) +#   define R500_TEX_DST_G_SWIZ_R			(0 << 26) +#   define R500_TEX_DST_G_SWIZ_G			(1 << 26) +#   define R500_TEX_DST_G_SWIZ_B			(2 << 26) +#   define R500_TEX_DST_G_SWIZ_A			(3 << 26) +#   define R500_TEX_DST_B_SWIZ_R			(0 << 28) +#   define R500_TEX_DST_B_SWIZ_G			(1 << 28) +#   define R500_TEX_DST_B_SWIZ_B			(2 << 28) +#   define R500_TEX_DST_B_SWIZ_A			(3 << 28) +#   define R500_TEX_DST_A_SWIZ_R			(0 << 30) +#   define R500_TEX_DST_A_SWIZ_G			(1 << 30) +#   define R500_TEX_DST_A_SWIZ_B			(2 << 30) +#   define R500_TEX_DST_A_SWIZ_A			(3 << 30) +#define R500_US_TEX_ADDR_DXDY_0				0xa000 +#   define R500_DX_ADDR(x)				(x << 0) +#   define R500_DX_ADDR_REL				(1 << 7) +#   define R500_DX_S_SWIZ_R				(0 << 8) +#   define R500_DX_S_SWIZ_G				(1 << 8) +#   define R500_DX_S_SWIZ_B				(2 << 8) +#   define R500_DX_S_SWIZ_A				(3 << 8) +#   define R500_DX_T_SWIZ_R				(0 << 10) +#   define R500_DX_T_SWIZ_G				(1 << 10) +#   define R500_DX_T_SWIZ_B				(2 << 10) +#   define R500_DX_T_SWIZ_A				(3 << 10) +#   define R500_DX_R_SWIZ_R				(0 << 12) +#   define R500_DX_R_SWIZ_G				(1 << 12) +#   define R500_DX_R_SWIZ_B				(2 << 12) +#   define R500_DX_R_SWIZ_A				(3 << 12) +#   define R500_DX_Q_SWIZ_R				(0 << 14) +#   define R500_DX_Q_SWIZ_G				(1 << 14) +#   define R500_DX_Q_SWIZ_B				(2 << 14) +#   define R500_DX_Q_SWIZ_A				(3 << 14) +#   define R500_DY_ADDR(x)				(x << 16) +#   define R500_DY_ADDR_REL				(1 << 17) +#   define R500_DY_S_SWIZ_R				(0 << 24) +#   define R500_DY_S_SWIZ_G				(1 << 24) +#   define R500_DY_S_SWIZ_B				(2 << 24) +#   define R500_DY_S_SWIZ_A				(3 << 24) +#   define R500_DY_T_SWIZ_R				(0 << 26) +#   define R500_DY_T_SWIZ_G				(1 << 26) +#   define R500_DY_T_SWIZ_B				(2 << 26) +#   define R500_DY_T_SWIZ_A				(3 << 26) +#   define R500_DY_R_SWIZ_R				(0 << 28) +#   define R500_DY_R_SWIZ_G				(1 << 28) +#   define R500_DY_R_SWIZ_B				(2 << 28) +#   define R500_DY_R_SWIZ_A				(3 << 28) +#   define R500_DY_Q_SWIZ_R				(0 << 30) +#   define R500_DY_Q_SWIZ_G				(1 << 30) +#   define R500_DY_Q_SWIZ_B				(2 << 30) +#   define R500_DY_Q_SWIZ_A				(3 << 30) +#define R500_US_TEX_INST_0				0x9000 +#   define R500_TEX_ID(x)				(x << 16) +#   define R500_TEX_INST_NOP				(0 << 22) +#   define R500_TEX_INST_LD				(1 << 22) +#   define R500_TEX_INST_TEXKILL			(2 << 22) +#   define R500_TEX_INST_PROJ				(3 << 22) +#   define R500_TEX_INST_LODBIAS			(4 << 22) +#   define R500_TEX_INST_LOD				(5 << 22) +#   define R500_TEX_INST_DXDY				(6 << 22) +#   define R500_TEX_SEM_ACQUIRE				(1 << 25) +#   define R500_TEX_IGNORE_UNCOVERED			(1 << 26) +#   define R500_TEX_UNSCALED				(1 << 27) +#define R300_US_W_FMT					0x46b4 +#   define R300_W_FMT_W0				(0 << 0) +#   define R300_W_FMT_W24				(1 << 0) +#   define R300_W_FMT_W24FP				(2 << 0) +#   define R300_W_SRC_US				(0 << 2) +#   define R300_W_SRC_RAS				(1 << 2) + + +/* Draw a primitive from vertex data in arrays loaded via 
3D_LOAD_VBPNTR. + * Two parameter dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + */ +#define R300_PACKET3_3D_DRAW_VBUF           0x00002800 + +/* Draw a primitive from immediate vertices in this packet + * Up to 16382 dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + * 2 to end: Up to 16380 dwords of vertex data. + */ +#define R300_PACKET3_3D_DRAW_IMMD           0x00002900 + +/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR and + * immediate vertices in this packet + * Up to 16382 dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + * 2 to end: Up to 16380 dwords of vertex data. + */ +#define R300_PACKET3_3D_DRAW_INDX           0x00002A00 + + +/* Specify the full set of vertex arrays as (address, stride). + * The first parameter is the number of vertex arrays specified. + * The rest of the command is a variable length list of blocks, where + * each block is three dwords long and specifies two arrays. + * The first dword of a block is split into two words, the lower significant + * word refers to the first array, the more significant word to the second + * array in the block. + * The low byte of each word contains the size of an array entry in dwords, + * the high byte contains the stride of the array. + * The second dword of a block contains the pointer to the first array, + * the third dword of a block contains the pointer to the second array. + * Note that if the total number of arrays is odd, the third dword of + * the last block is omitted. + */ +#define R300_PACKET3_3D_LOAD_VBPNTR         0x00002F00 + +#define R300_PACKET3_INDX_BUFFER            0x00003300 +#    define R300_EB_UNK1_SHIFT                      24 +#    define R300_EB_UNK1                    (0x80<<24) +#    define R300_EB_UNK2                        0x0810 + +/* Same as R300_PACKET3_3D_DRAW_VBUF but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_VBUF_2         0x00003400 +/* Same as R300_PACKET3_3D_DRAW_IMMD but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_IMMD_2         0x00003500 +/* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_INDX_2         0x00003600 + +/* Clears a portion of hierachical Z RAM + * 3 dword parameters + * 0. START + * 1. COUNT: 13:0 (max is 0x3FFF) + * 2. CLEAR_VALUE: Value to write into HIZ RAM. + */ +#define R300_PACKET3_3D_CLEAR_HIZ           0x00003700 + +/* Draws a set of primitives using vertex buffers pointed by the state data. + * At least 2 Parameters: + * 0. VAP_VF_CNTL: The first parameter is a standard primitive emission dword. 
+ * 2 to end: Data or indices (see other 3D_DRAW_* packets for details) + */ +#define R300_PACKET3_3D_DRAW_128            0x00003900 + +/* END: Packet 3 commands */ + + +/* Color formats for 2d packets + */ +#define R300_CP_COLOR_FORMAT_CI8	2 +#define R300_CP_COLOR_FORMAT_ARGB1555	3 +#define R300_CP_COLOR_FORMAT_RGB565	4 +#define R300_CP_COLOR_FORMAT_ARGB8888	6 +#define R300_CP_COLOR_FORMAT_RGB332	7 +#define R300_CP_COLOR_FORMAT_RGB8	9 +#define R300_CP_COLOR_FORMAT_ARGB4444	15 + +/* + * CP type-3 packets + */ +#define R300_CP_CMD_BITBLT_MULTI	0xC0009B00 + +/* XXX Corbin's stuff from radeon and r200 */ + +#define RADEON_WAIT_UNTIL                   0x1720 +#       define RADEON_WAIT_CRTC_PFLIP       (1 << 0) +#       define RADEON_WAIT_2D_IDLECLEAN     (1 << 16) +#       define RADEON_WAIT_3D_IDLECLEAN     (1 << 17) +#       define RADEON_WAIT_HOST_IDLECLEAN   (1 << 18) + +#define RADEON_CP_PACKET3                           0xC0000000 + +#define R200_3D_DRAW_IMMD_2      0xC0003500 + +#endif /* _R300_REG_H */ + +/* *INDENT-ON* */ + +/* vim: set foldenable foldmarker=\\{,\\} foldmethod=marker : */ diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c new file mode 100644 index 0000000000..8ed66a1660 --- /dev/null +++ b/src/gallium/drivers/r300/r300_screen.c @@ -0,0 +1,271 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_screen.h" + +/* Return the identifier behind whom the brave coders responsible for this + * amalgamation of code, sweat, and duct tape, routinely obscure their names. + * + * ...I should have just put "Corbin Simpson", but I'm not that cool. + * + * (Or egotistical. Yet.) 
*/ +static const char* r300_get_vendor(struct pipe_screen* pscreen) +{ +    return "X.Org R300 Project"; +} + +static const char* chip_families[] = { +    "R300", +    "R350", +    "R360", +    "RV350", +    "RV370", +    "RV380", +    "R420", +    "R423", +    "R430", +    "R480", +    "R481", +    "RV410", +    "RS400", +    "RC410", +    "RS480", +    "RS482", +    "RS690", +    "RS740", +    "RV515", +    "R520", +    "RV530", +    "R580", +    "RV560", +    "RV570" +}; + +static const char* r300_get_name(struct pipe_screen* pscreen) +{ +    struct r300_screen* r300screen = r300_screen(pscreen); + +    return chip_families[r300screen->caps->family]; +} + +static int r300_get_param(struct pipe_screen* pscreen, int param) +{ +    struct r300_screen* r300screen = r300_screen(pscreen); + +    switch (param) { +        /* XXX cases marked "IN THEORY" are possible on the hardware, +         * but haven't been implemented yet. */ +        case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +            /* XXX I'm told this goes up to 16 */ +            return 8; +        case PIPE_CAP_NPOT_TEXTURES: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_TWO_SIDED_STENCIL: +            if (r300screen->caps->is_r500) { +                return 1; +            } else { +                return 0; +            } +            return 0; +        case PIPE_CAP_GLSL: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_S3TC: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_ANISOTROPIC_FILTER: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_POINT_SPRITE: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_MAX_RENDER_TARGETS: +            /* XXX 4 eventually */ +            return 1; +        case PIPE_CAP_OCCLUSION_QUERY: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_TEXTURE_SHADOW_MAP: +            /* IN THEORY */ +            return 0; +        case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +            if (r300screen->caps->is_r500) { +                /* 13 == 4096x4096 */ +                return 13; +            } else { +                /* 12 == 2048x2048 */ +                return 12; +            } +        case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +            /* So, technically, the limit is the same as above, but some math +             * shows why this is silly. Assuming RGBA, 4cpp, we can see that +             * 4096*4096*4096 = 64.0 GiB exactly, so it's not exactly +             * practical. However, if at some point a game really wants this, +             * then we can remove this limit. 
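+             * (For the record, 4096*4096*4096 is 64 Gi texels; at 4 bytes
+             * per texel that works out to 256 GiB of texture data.)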
*/ +            if (r300screen->caps->is_r500) { +                /* 9 == 256x256x256 */ +                return 9; +            } else { +                /* 8 == 128*128*128 */ +                return 8; +            } +        case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +            if (r300screen->caps->is_r500) { +                /* 13 == 4096x4096 */ +                return 13; +            } else { +                /* 12 == 2048x2048 */ +                return 12; +            } +        case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +            return 1; +        case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +            return 1; +        case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +            /* XXX guessing */ +            return 2; +        default: +            debug_printf("r300: Implementation error: Bad param %d\n", +                param); +            return 0; +    } +} + +static float r300_get_paramf(struct pipe_screen* pscreen, int param) +{ +    switch (param) { +        case PIPE_CAP_MAX_LINE_WIDTH: +        case PIPE_CAP_MAX_LINE_WIDTH_AA: +            /* XXX this is the biggest thing that will fit in that register. +            * Perhaps the actual rendering limits are less? */ +            return 10922.0f; +        case PIPE_CAP_MAX_POINT_WIDTH: +        case PIPE_CAP_MAX_POINT_WIDTH_AA: +            /* XXX this is the biggest thing that will fit in that register. +             * Perhaps the actual rendering limits are less? */ +            return 10922.0f; +        case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +            return 16.0f; +        case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +            return 16.0f; +        default: +            debug_printf("r300: Implementation error: Bad paramf %d\n", +                param); +            return 0.0f; +    } +} + +/* XXX moar formats */ +static boolean check_tex_2d_format(enum pipe_format format) +{ +    switch (format) { +        case PIPE_FORMAT_A8R8G8B8_UNORM: +        case PIPE_FORMAT_I8_UNORM: +            return TRUE; +        default: +            debug_printf("r300: Warning: Got unknown format: %s, in %s\n", +                pf_name(format), __FUNCTION__); +            break; +    } + +    return FALSE; +} + +/* XXX moar targets */ +static boolean r300_is_format_supported(struct pipe_screen* pscreen, +                                        enum pipe_format format, +                                        enum pipe_texture_target target, +                                        unsigned tex_usage, +                                        unsigned geom_flags) +{ +    switch (target) { +        case PIPE_TEXTURE_2D: +            return check_tex_2d_format(format); +        default: +            debug_printf("r300: Warning: Got unknown format target: %d\n", +                format); +            break; +    } + +    return FALSE; +} + +static void* r300_surface_map(struct pipe_screen* screen, +                              struct pipe_surface* surface, +                              unsigned flags) +{ +    struct r300_texture* tex = (struct r300_texture*)surface->texture; +    char* map = pipe_buffer_map(screen, tex->buffer, flags); + +    if (!map) { +        return NULL; +    } + +    return map + surface->offset; +} + +static void r300_surface_unmap(struct pipe_screen* screen, +                               struct pipe_surface* surface) +{ +    struct r300_texture* tex = (struct r300_texture*)surface->texture; +    pipe_buffer_unmap(screen, tex->buffer); +} + +static void r300_destroy_screen(struct pipe_screen* pscreen) +{ +    struct 
r300_screen* r300screen = r300_screen(pscreen); + +    FREE(r300screen->caps); +    FREE(r300screen); +} + +struct pipe_screen* r300_create_screen(struct pipe_winsys* winsys, +                                       struct r300_winsys* r300_winsys) +{ +    struct r300_screen* r300screen = CALLOC_STRUCT(r300_screen); +    struct r300_capabilities* caps = CALLOC_STRUCT(r300_capabilities); + +    if (!r300screen || !caps) +        return NULL; + +    caps->pci_id = r300_winsys->pci_id; +    caps->num_frag_pipes = r300_winsys->gb_pipes; + +    r300_parse_chipset(caps); + +    r300screen->caps = caps; +    r300screen->screen.winsys = winsys; +    r300screen->screen.destroy = r300_destroy_screen; +    r300screen->screen.get_name = r300_get_name; +    r300screen->screen.get_vendor = r300_get_vendor; +    r300screen->screen.get_param = r300_get_param; +    r300screen->screen.get_paramf = r300_get_paramf; +    r300screen->screen.is_format_supported = r300_is_format_supported; +    r300screen->screen.surface_map = r300_surface_map; +    r300screen->screen.surface_unmap = r300_surface_unmap; + +    r300_init_screen_texture_functions(&r300screen->screen); +    u_simple_screen_init(&r300screen->screen); + +    return &r300screen->screen; +} diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h new file mode 100644 index 0000000000..2e25f61dbf --- /dev/null +++ b/src/gallium/drivers/r300/r300_screen.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_SCREEN_H +#define R300_SCREEN_H + +#include "pipe/p_inlines.h" +#include "pipe/p_screen.h" +#include "util/u_memory.h" +#include "util/u_simple_screen.h" + +#include "r300_chipset.h" +#include "r300_texture.h" +#include "r300_winsys.h" + +struct r300_screen { +    /* Parent class */ +    struct pipe_screen screen; + +    /* Chipset capabilities */ +    struct r300_capabilities* caps; +}; + +/* Convenience cast wrapper. */ +static struct r300_screen* r300_screen(struct pipe_screen* screen) { +    return (struct r300_screen*)screen; +} + +/* Creates a new r300 screen. 
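+ *
+ * A winsys would typically call this once while bringing up the device; as a
+ * sketch (assuming the winsys objects have already been created):
+ *
+ *   struct pipe_screen* screen = r300_create_screen(winsys, r300_winsys);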
*/ +struct pipe_screen* r300_create_screen(struct pipe_winsys* winsys, +                                       struct r300_winsys* r300_winsys); + +#endif /* R300_SCREEN_H */ diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c new file mode 100644 index 0000000000..9392d72342 --- /dev/null +++ b/src/gallium/drivers/r300/r300_state.c @@ -0,0 +1,826 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "util/u_math.h" +#include "util/u_pack_color.h" +#include "pipe/p_debug.h" + +#include "r300_context.h" +#include "r300_reg.h" + +/* r300_state: Functions used to intialize state context by translating + * Gallium state objects into semi-native r300 state objects. + * + * XXX break this file up into pieces if it gets too big! */ + +/* Pack a float into a dword. */ +static uint32_t pack_float_32(float f) +{ +    union { +        float f; +        uint32_t u; +    } u; + +    u.f = f; +    return u.u; +} + +static uint32_t translate_blend_function(int blend_func) { +    switch (blend_func) { +        case PIPE_BLEND_ADD: +            return R300_COMB_FCN_ADD_CLAMP; +        case PIPE_BLEND_SUBTRACT: +            return R300_COMB_FCN_SUB_CLAMP; +        case PIPE_BLEND_REVERSE_SUBTRACT: +            return R300_COMB_FCN_RSUB_CLAMP; +        case PIPE_BLEND_MIN: +            return R300_COMB_FCN_MIN; +        case PIPE_BLEND_MAX: +            return R300_COMB_FCN_MAX; +        default: +            debug_printf("r300: Unknown blend function %d\n", blend_func); +            break; +    } +    return 0; +} + +/* XXX we can also offer the D3D versions of some of these... 
*/ +static uint32_t translate_blend_factor(int blend_fact) { +    switch (blend_fact) { +        case PIPE_BLENDFACTOR_ONE: +            return R300_BLEND_GL_ONE; +        case PIPE_BLENDFACTOR_SRC_COLOR: +            return R300_BLEND_GL_SRC_COLOR; +        case PIPE_BLENDFACTOR_SRC_ALPHA: +            return R300_BLEND_GL_SRC_ALPHA; +        case PIPE_BLENDFACTOR_DST_ALPHA: +            return R300_BLEND_GL_DST_ALPHA; +        case PIPE_BLENDFACTOR_DST_COLOR: +            return R300_BLEND_GL_DST_COLOR; +        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +            return R300_BLEND_GL_SRC_ALPHA_SATURATE; +        case PIPE_BLENDFACTOR_CONST_COLOR: +            return R300_BLEND_GL_CONST_COLOR; +        case PIPE_BLENDFACTOR_CONST_ALPHA: +            return R300_BLEND_GL_CONST_ALPHA; +        /* XXX WTF are these? +        case PIPE_BLENDFACTOR_SRC1_COLOR: +        case PIPE_BLENDFACTOR_SRC1_ALPHA: */ +        case PIPE_BLENDFACTOR_ZERO: +            return R300_BLEND_GL_ZERO; +        case PIPE_BLENDFACTOR_INV_SRC_COLOR: +            return R300_BLEND_GL_ONE_MINUS_SRC_COLOR; +        case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +            return R300_BLEND_GL_ONE_MINUS_SRC_ALPHA; +        case PIPE_BLENDFACTOR_INV_DST_ALPHA: +            return R300_BLEND_GL_ONE_MINUS_DST_ALPHA; +        case PIPE_BLENDFACTOR_INV_DST_COLOR: +            return R300_BLEND_GL_ONE_MINUS_DST_COLOR; +        case PIPE_BLENDFACTOR_INV_CONST_COLOR: +            return R300_BLEND_GL_ONE_MINUS_CONST_COLOR; +        case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +            return R300_BLEND_GL_ONE_MINUS_CONST_ALPHA; +        /* XXX see above +        case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: */ +        default: +            debug_printf("r300: Unknown blend factor %d\n", blend_fact); +            break; +    } +    return 0; +} + +/* Create a new blend state based on the CSO blend state. + * + * This encompasses alpha blending, logic/raster ops, and blend dithering. */ +static void* r300_create_blend_state(struct pipe_context* pipe, +                                     const struct pipe_blend_state* state) +{ +    struct r300_blend_state* blend = CALLOC_STRUCT(r300_blend_state); + +    if (state->blend_enable) { +        /* XXX for now, always do separate alpha... +         * is it faster to do it with one reg? */ +        blend->blend_control = R300_ALPHA_BLEND_ENABLE | +                R300_SEPARATE_ALPHA_ENABLE | +                R300_READ_ENABLE | +                translate_blend_function(state->rgb_func) | +                (translate_blend_factor(state->rgb_src_factor) << +                    R300_SRC_BLEND_SHIFT) | +                (translate_blend_factor(state->rgb_dst_factor) << +                    R300_DST_BLEND_SHIFT); +        blend->alpha_blend_control = +                translate_blend_function(state->alpha_func) | +                (translate_blend_factor(state->alpha_src_factor) << +                    R300_SRC_BLEND_SHIFT) | +                (translate_blend_factor(state->alpha_dst_factor) << +                    R300_DST_BLEND_SHIFT); +    } + +    /* PIPE_LOGICOP_* don't need to be translated, fortunately. */ +    /* XXX are logicops still allowed if blending's disabled? +     * Does Gallium take care of it for us? 
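+     *
+     * Since no translation is needed, the dword below is simply the Gallium
+     * enum shifted into place; e.g. PIPE_LOGICOP_XOR would yield
+     * R300_RB3D_ROPCNTL_ROP_ENABLE |
+     * (PIPE_LOGICOP_XOR << R300_RB3D_ROPCNTL_ROP_SHIFT).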
*/ +    if (state->logicop_enable) { +        blend->rop = R300_RB3D_ROPCNTL_ROP_ENABLE | +                (state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT; +    } + +    if (state->dither) { +        blend->dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT | +                R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT; +    } + +    return (void*)blend; +} + +/* Bind blend state. */ +static void r300_bind_blend_state(struct pipe_context* pipe, +                                  void* state) +{ +    struct r300_context* r300 = r300_context(pipe); + +    r300->blend_state = (struct r300_blend_state*)state; +    r300->dirty_state |= R300_NEW_BLEND; +} + +/* Free blend state. */ +static void r300_delete_blend_state(struct pipe_context* pipe, +                                    void* state) +{ +    FREE(state); +} + +/* Set blend color. + * Setup both R300 and R500 registers, figure out later which one to write. */ +static void r300_set_blend_color(struct pipe_context* pipe, +                                 const struct pipe_blend_color* color) +{ +    struct r300_context* r300 = r300_context(pipe); +    uint32_t r, g, b, a; +    ubyte ur, ug, ub, ua; + +    r = util_iround(color->color[0] * 1023.0f); +    g = util_iround(color->color[1] * 1023.0f); +    b = util_iround(color->color[2] * 1023.0f); +    a = util_iround(color->color[3] * 1023.0f); + +    ur = float_to_ubyte(color->color[0]); +    ug = float_to_ubyte(color->color[1]); +    ub = float_to_ubyte(color->color[2]); +    ua = float_to_ubyte(color->color[3]); + +    r300->blend_color_state->blend_color = (a << 24) | (r << 16) | (g << 8) | b; + +    r300->blend_color_state->blend_color_red_alpha = ur | (ua << 16); +    r300->blend_color_state->blend_color_green_blue = ub | (ug << 16); + +    r300->dirty_state |= R300_NEW_BLEND_COLOR; +} + +static void r300_set_clip_state(struct pipe_context* pipe, +                                const struct pipe_clip_state* state) +{ +    struct r300_context* r300 = r300_context(pipe); +    /* XXX Draw */ +    draw_flush(r300->draw); +    draw_set_clip_state(r300->draw, state); +} + +static void +    r300_set_constant_buffer(struct pipe_context* pipe, +                             uint shader, uint index, +                             const struct pipe_constant_buffer* buffer) +{ +    /* XXX */ +} + +static uint32_t translate_depth_stencil_function(int zs_func) { +    switch (zs_func) { +        case PIPE_FUNC_NEVER: +            return R300_ZS_NEVER; +        case PIPE_FUNC_LESS: +            return R300_ZS_LESS; +        case PIPE_FUNC_EQUAL: +            return R300_ZS_EQUAL; +        case PIPE_FUNC_LEQUAL: +            return R300_ZS_LEQUAL; +        case PIPE_FUNC_GREATER: +            return R300_ZS_GREATER; +        case PIPE_FUNC_NOTEQUAL: +            return R300_ZS_NOTEQUAL; +        case PIPE_FUNC_GEQUAL: +            return R300_ZS_GEQUAL; +        case PIPE_FUNC_ALWAYS: +            return R300_ZS_ALWAYS; +        default: +            debug_printf("r300: Unknown depth/stencil function %d\n", +                zs_func); +            break; +    } +    return 0; +} + +static uint32_t translate_stencil_op(int s_op) { +    switch (s_op) { +        case PIPE_STENCIL_OP_KEEP: +            return R300_ZS_KEEP; +        case PIPE_STENCIL_OP_ZERO: +            return R300_ZS_ZERO; +        case PIPE_STENCIL_OP_REPLACE: +            return R300_ZS_REPLACE; +        case PIPE_STENCIL_OP_INCR: +            return R300_ZS_INCR; +        case PIPE_STENCIL_OP_DECR: +            return R300_ZS_DECR; +    
    case PIPE_STENCIL_OP_INCR_WRAP: +            return R300_ZS_INCR_WRAP; +        case PIPE_STENCIL_OP_DECR_WRAP: +            return R300_ZS_DECR_WRAP; +        case PIPE_STENCIL_OP_INVERT: +            return R300_ZS_INVERT; +        default: +            debug_printf("r300: Unknown stencil op %d", s_op); +            break; +    } +    return 0; +} + +static uint32_t translate_alpha_function(int alpha_func) { +    switch (alpha_func) { +        case PIPE_FUNC_NEVER: +            return R300_FG_ALPHA_FUNC_NEVER; +        case PIPE_FUNC_LESS: +            return R300_FG_ALPHA_FUNC_LESS; +        case PIPE_FUNC_EQUAL: +            return R300_FG_ALPHA_FUNC_EQUAL; +        case PIPE_FUNC_LEQUAL: +            return R300_FG_ALPHA_FUNC_LE; +        case PIPE_FUNC_GREATER: +            return R300_FG_ALPHA_FUNC_GREATER; +        case PIPE_FUNC_NOTEQUAL: +            return R300_FG_ALPHA_FUNC_NOTEQUAL; +        case PIPE_FUNC_GEQUAL: +            return R300_FG_ALPHA_FUNC_GE; +        case PIPE_FUNC_ALWAYS: +            return R300_FG_ALPHA_FUNC_ALWAYS; +        default: +            debug_printf("r300: Unknown alpha function %d", alpha_func); +            break; +    } +    return 0; +} + +/* Create a new depth, stencil, and alpha state based on the CSO dsa state. + * + * This contains the depth buffer, stencil buffer, alpha test, and such. + * On the Radeon, depth and stencil buffer setup are intertwined, which is + * the reason for some of the strange-looking assignments across registers. */ +static void* +        r300_create_dsa_state(struct pipe_context* pipe, +                              const struct pipe_depth_stencil_alpha_state* state) +{ +    struct r300_dsa_state* dsa = CALLOC_STRUCT(r300_dsa_state); + +    /* Depth test setup. */ +    if (state->depth.enabled) { +        dsa->z_buffer_control |= R300_Z_ENABLE; + +        if (state->depth.writemask) { +            dsa->z_buffer_control |= R300_Z_WRITE_ENABLE; +        } + +        dsa->z_stencil_control |= +            (translate_depth_stencil_function(state->depth.func) << +                R300_Z_FUNC_SHIFT); +    } + +    /* Stencil buffer setup. 
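+     *
+     * Front- and back-face state share the single z_stencil_control dword:
+     * the front face is packed with the R300_S_FRONT_* shifts and the back
+     * face with the R300_S_BACK_* shifts, with R300_STENCIL_FRONT_BACK
+     * enabling the two-sided path.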
*/ +    if (state->stencil[0].enabled) { +        dsa->z_buffer_control |= R300_STENCIL_ENABLE; +        dsa->z_stencil_control |= +                (translate_depth_stencil_function(state->stencil[0].func) << +                    R300_S_FRONT_FUNC_SHIFT) | +                (translate_stencil_op(state->stencil[0].fail_op) << +                    R300_S_FRONT_SFAIL_OP_SHIFT) | +                (translate_stencil_op(state->stencil[0].zpass_op) << +                    R300_S_FRONT_ZPASS_OP_SHIFT) | +                (translate_stencil_op(state->stencil[0].zfail_op) << +                    R300_S_FRONT_ZFAIL_OP_SHIFT); + +        dsa->stencil_ref_mask = (state->stencil[0].ref_value) | +                (state->stencil[0].valuemask << R300_STENCILMASK_SHIFT) | +                (state->stencil[0].writemask << R300_STENCILWRITEMASK_SHIFT); + +        if (state->stencil[1].enabled) { +            dsa->z_buffer_control |= R300_STENCIL_FRONT_BACK; +            dsa->z_stencil_control |= +                (translate_depth_stencil_function(state->stencil[1].func) << +                    R300_S_BACK_FUNC_SHIFT) | +                (translate_stencil_op(state->stencil[1].fail_op) << +                    R300_S_BACK_SFAIL_OP_SHIFT) | +                (translate_stencil_op(state->stencil[1].zpass_op) << +                    R300_S_BACK_ZPASS_OP_SHIFT) | +                (translate_stencil_op(state->stencil[1].zfail_op) << +                    R300_S_BACK_ZFAIL_OP_SHIFT); + +            dsa->stencil_ref_bf = (state->stencil[1].ref_value) | +                (state->stencil[1].valuemask << R300_STENCILMASK_SHIFT) | +                (state->stencil[1].writemask << R300_STENCILWRITEMASK_SHIFT); +        } +    } + +    /* Alpha test setup. */ +    if (state->alpha.enabled) { +        dsa->alpha_function = translate_alpha_function(state->alpha.func) | +            R300_FG_ALPHA_FUNC_ENABLE; +        dsa->alpha_reference = CLAMP(state->alpha.ref_value * 1023.0f, +                                     0, 1023); +    } else { +        dsa->z_buffer_top = R300_ZTOP_ENABLE; +    } + +    return (void*)dsa; +} + +/* Bind DSA state. */ +static void r300_bind_dsa_state(struct pipe_context* pipe, +                                void* state) +{ +    struct r300_context* r300 = r300_context(pipe); + +    r300->dsa_state = (struct r300_dsa_state*)state; +    r300->dirty_state |= R300_NEW_DSA; +} + +/* Free DSA state. */ +static void r300_delete_dsa_state(struct pipe_context* pipe, +                                  void* state) +{ +    FREE(state); +} + +static void r300_set_edgeflags(struct pipe_context* pipe, +                               const unsigned* bitfield) +{ +    /* XXX you know it's bad when i915 has this blank too */ +} + +static void +    r300_set_framebuffer_state(struct pipe_context* pipe, +                               const struct pipe_framebuffer_state* state) +{ +    struct r300_context* r300 = r300_context(pipe); + +    draw_flush(r300->draw); + +    r300->framebuffer_state = *state; + +    r300->dirty_state |= R300_NEW_FRAMEBUFFERS; +} + +/* Create fragment shader state. 
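 * Nothing is compiled at create time: the CSO is simply copied, the R300
 * vs. R500 container is picked from the screen caps, and translation to
 * hardware code is deferred until the shader is first bound (see
 * r300_bind_fs_state below).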
*/ +static void* r300_create_fs_state(struct pipe_context* pipe, +                                  const struct pipe_shader_state* shader) +{ +    struct r300_context* r300 = r300_context(pipe); +    struct r3xx_fragment_shader* fs = NULL; + +    if (r300_screen(r300->context.screen)->caps->is_r500) { +        fs = +            (struct r3xx_fragment_shader*)CALLOC_STRUCT(r500_fragment_shader); +    } else { +        fs = +            (struct r3xx_fragment_shader*)CALLOC_STRUCT(r300_fragment_shader); +    } + +    /* Copy state directly into shader. */ +    fs->state = *shader; + +    return (void*)fs; +} + +/* Bind fragment shader state. */ +static void r300_bind_fs_state(struct pipe_context* pipe, void* shader) +{ +    struct r300_context* r300 = r300_context(pipe); +    struct r3xx_fragment_shader* fs = (struct r3xx_fragment_shader*)shader; + +    if (!fs->translated) { +        if (r300_screen(r300->context.screen)->caps->is_r500) { +            r500_translate_shader(r300, fs); +        } else { +            r300_translate_shader(r300, fs); +        } +    } + +    r300->fs = fs; + +    r300->dirty_state |= R300_NEW_FRAGMENT_SHADER; +} + +/* Delete fragment shader state. */ +static void r300_delete_fs_state(struct pipe_context* pipe, void* shader) +{ +    FREE(shader); +} + +static void r300_set_polygon_stipple(struct pipe_context* pipe, +                                     const struct pipe_poly_stipple* state) +{ +    /* XXX */ +} + +static INLINE int pack_float_16_6x(float f) { +    return ((int)(f * 6.0) & 0xffff); +} + +/* Create a new rasterizer state based on the CSO rasterizer state. + * + * This is a very large chunk of state, and covers most of the graphics + * backend (GB), geometry assembly (GA), and setup unit (SU) blocks. + * + * In a not entirely unironic sidenote, this state has nearly nothing to do + * with the actual block on the Radeon called the rasterizer (RS). */ +static void* r300_create_rs_state(struct pipe_context* pipe, +                                  const struct pipe_rasterizer_state* state) +{ +    struct r300_rs_state* rs = CALLOC_STRUCT(r300_rs_state); + +    /* XXX this is part of HW TCL */ +    /* XXX endian control */ +    rs->vap_control_status = R300_VAP_TCL_BYPASS; + +    rs->point_size = pack_float_16_6x(state->point_size) | +        (pack_float_16_6x(state->point_size) << R300_POINTSIZE_X_SHIFT); + +    rs->line_control = pack_float_16_6x(state->line_width) | +        R300_GA_LINE_CNTL_END_TYPE_COMP; + +    /* Radeons don't think in "CW/CCW", they think in "front/back". 
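 * Whichever winding the state tracker declares front-facing decides how
 * the CW/CCW polygon offset enables and the cull bits are remapped onto
 * the hardware's front and back faces below.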
*/ +    if (state->front_winding == PIPE_WINDING_CW) { +        rs->cull_mode = R300_FRONT_FACE_CW; + +        if (state->offset_cw) { +            rs->polygon_offset_enable |= R300_FRONT_ENABLE; +        } +        if (state->offset_ccw) { +            rs->polygon_offset_enable |= R300_BACK_ENABLE; +        } +    } else { +        rs->cull_mode = R300_FRONT_FACE_CCW; + +        if (state->offset_ccw) { +            rs->polygon_offset_enable |= R300_FRONT_ENABLE; +        } +        if (state->offset_cw) { +            rs->polygon_offset_enable |= R300_BACK_ENABLE; +        } +    } +    if (state->front_winding & state->cull_mode) { +        rs->cull_mode |= R300_CULL_FRONT; +    } +    if (~(state->front_winding) & state->cull_mode) { +        rs->cull_mode |= R300_CULL_BACK; +    } + +    if (rs->polygon_offset_enable) { +        rs->depth_offset_front = rs->depth_offset_back = +                pack_float_32(state->offset_units); +        rs->depth_scale_front = rs->depth_scale_back = +                pack_float_32(state->offset_scale); +    } + +    if (state->line_stipple_enable) { +        rs->line_stipple_config = +            R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE | +            (pack_float_32((float)state->line_stipple_factor) & +                R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK); +        /* XXX this might need to be scaled up */ +        rs->line_stipple_value = state->line_stipple_pattern; +    } + +    return (void*)rs; +} + +/* Bind rasterizer state. */ +static void r300_bind_rs_state(struct pipe_context* pipe, void* state) +{ +    struct r300_context* r300 = r300_context(pipe); + +    r300->rs_state = (struct r300_rs_state*)state; +    r300->dirty_state |= R300_NEW_RASTERIZER; +} + +/* Free rasterizer state. */ +static void r300_delete_rs_state(struct pipe_context* pipe, void* state) +{ +    FREE(state); +} + +static uint32_t translate_wrap(int wrap) { +    switch (wrap) { +        case PIPE_TEX_WRAP_REPEAT: +            return R300_TX_REPEAT; +        case PIPE_TEX_WRAP_CLAMP: +            return R300_TX_CLAMP; +        case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +            return R300_TX_CLAMP_TO_EDGE; +        case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +            return R300_TX_CLAMP_TO_BORDER; +        case PIPE_TEX_WRAP_MIRROR_REPEAT: +            return R300_TX_REPEAT | R300_TX_MIRRORED; +        case PIPE_TEX_WRAP_MIRROR_CLAMP: +            return R300_TX_CLAMP | R300_TX_MIRRORED; +        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +            return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; +        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +            return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; +        default: +            debug_printf("r300: Unknown texture wrap %d", wrap); +            return 0; +    } +} + +static uint32_t translate_tex_filters(int min, int mag, int mip) { +    uint32_t retval = 0; +    switch (min) { +        case PIPE_TEX_FILTER_NEAREST: +            retval |= R300_TX_MIN_FILTER_NEAREST; +        case PIPE_TEX_FILTER_LINEAR: +            retval |= R300_TX_MIN_FILTER_LINEAR; +        case PIPE_TEX_FILTER_ANISO: +            retval |= R300_TX_MIN_FILTER_ANISO; +        default: +            debug_printf("r300: Unknown texture filter %d", min); +            break; +    } +    switch (mag) { +        case PIPE_TEX_FILTER_NEAREST: +            retval |= R300_TX_MAG_FILTER_NEAREST; +        case PIPE_TEX_FILTER_LINEAR: +            retval |= R300_TX_MAG_FILTER_LINEAR; +        case PIPE_TEX_FILTER_ANISO: +            retval |= 
R300_TX_MAG_FILTER_ANISO; +        default: +            debug_printf("r300: Unknown texture filter %d", mag); +            break; +    } +    switch (mip) { +        case PIPE_TEX_MIPFILTER_NONE: +            retval |= R300_TX_MIN_FILTER_MIP_NONE; +        case PIPE_TEX_MIPFILTER_NEAREST: +            retval |= R300_TX_MIN_FILTER_MIP_NEAREST; +        case PIPE_TEX_MIPFILTER_LINEAR: +            retval |= R300_TX_MIN_FILTER_MIP_LINEAR; +        default: +            debug_printf("r300: Unknown texture filter %d", mip); +            break; +    } + +    return retval; +} + +static uint32_t anisotropy(float max_aniso) { +    if (max_aniso >= 16.0f) { +        return R300_TX_MAX_ANISO_16_TO_1; +    } else if (max_aniso >= 8.0f) { +        return R300_TX_MAX_ANISO_8_TO_1; +    } else if (max_aniso >= 4.0f) { +        return R300_TX_MAX_ANISO_4_TO_1; +    } else if (max_aniso >= 2.0f) { +        return R300_TX_MAX_ANISO_2_TO_1; +    } else { +        return R300_TX_MAX_ANISO_1_TO_1; +    } +} + +static void* +        r300_create_sampler_state(struct pipe_context* pipe, +                                  const struct pipe_sampler_state* state) +{ +    struct r300_context* r300 = r300_context(pipe); +    struct r300_sampler_state* sampler = CALLOC_STRUCT(r300_sampler_state); +    int lod_bias; + +    sampler->filter0 |= +        (translate_wrap(state->wrap_s) << R300_TX_WRAP_S_SHIFT) | +        (translate_wrap(state->wrap_t) << R300_TX_WRAP_T_SHIFT) | +        (translate_wrap(state->wrap_r) << R300_TX_WRAP_R_SHIFT); + +    sampler->filter0 |= translate_tex_filters(state->min_img_filter, +                                              state->mag_img_filter, +                                              state->min_mip_filter); + +    lod_bias = CLAMP((int)(state->lod_bias * 32), -(1 << 9), (1 << 9) - 1); + +    sampler->filter1 |= lod_bias << R300_LOD_BIAS_SHIFT; + +    sampler->filter1 |= anisotropy(state->max_anisotropy); + +    util_pack_color(state->border_color, PIPE_FORMAT_A8R8G8B8_UNORM, +                    &sampler->border_color); + +    /* R500-specific fixups and optimizations */ +    if (r300_screen(r300->context.screen)->caps->is_r500) { +        sampler->filter1 |= R500_BORDER_FIX; +    } + +    return (void*)sampler; +} + +static void r300_bind_sampler_states(struct pipe_context* pipe, +                                     unsigned count, +                                     void** states) +{ +    struct r300_context* r300 = r300_context(pipe); +    int i; + +    if (count > 8) { +        return; +    } + +    for (i = 0; i < count; i++) { +        if (r300->sampler_states[i] != states[i]) { +            r300->sampler_states[i] = (struct r300_sampler_state*)states[i]; +            r300->dirty_state |= (R300_NEW_SAMPLER << i); +        } +    } + +    r300->sampler_count = count; +} + +static void r300_delete_sampler_state(struct pipe_context* pipe, void* state) +{ +    FREE(state); +} + +static void r300_set_sampler_textures(struct pipe_context* pipe, +                                      unsigned count, +                                      struct pipe_texture** texture) +{ +    struct r300_context* r300 = r300_context(pipe); +    int i; + +    /* XXX magic num */ +    if (count > 8) { +        return; +    } + +    for (i = 0; i < count; i++) { +        if (r300->textures[i] != (struct r300_texture*)texture[i]) { +            pipe_texture_reference((struct pipe_texture**)&r300->textures[i], +                texture[i]); +            r300->dirty_state |= (R300_NEW_TEXTURE << i); 
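            /* Each unit carries its own dirty bit (R300_NEW_TEXTURE << i,
             * and R300_NEW_SAMPLER << i above), so only the units whose
             * bindings actually changed need to be re-emitted. */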
+        } +    } + +    for (i = count; i < 8; i++) { +        if (r300->textures[i]) { +            pipe_texture_reference((struct pipe_texture**)&r300->textures[i], +                NULL); +            r300->dirty_state |= (R300_NEW_TEXTURE << i); +        } +    } + +    r300->texture_count = count; +} + +static void r300_set_scissor_state(struct pipe_context* pipe, +                                   const struct pipe_scissor_state* state) +{ +    struct r300_context* r300 = r300_context(pipe); +    draw_flush(r300->draw); + +    uint32_t left, top, right, bottom; + +    /* So, a bit of info. The scissors are offset by R300_SCISSORS_OFFSET in +     * both directions for all values, and can only be 13 bits wide. Why? +     * We may never know. */ +    left = (state->minx + R300_SCISSORS_OFFSET) & 0x1fff; +    top = (state->miny + R300_SCISSORS_OFFSET) & 0x1fff; +    right = (state->maxx + R300_SCISSORS_OFFSET) & 0x1fff; +    bottom = (state->maxy + R300_SCISSORS_OFFSET) & 0x1fff; + +    r300->scissor_state->scissor_top_left = (left << R300_SCISSORS_X_SHIFT) | +            (top << R300_SCISSORS_Y_SHIFT); +    r300->scissor_state->scissor_bottom_right = +        (right << R300_SCISSORS_X_SHIFT) | (bottom << R300_SCISSORS_Y_SHIFT); + +    r300->dirty_state |= R300_NEW_SCISSOR; +} + +static void r300_set_viewport_state(struct pipe_context* pipe, +                                    const struct pipe_viewport_state* state) +{ +    struct r300_context* r300 = r300_context(pipe); +    /* XXX handing this off to Draw for now */ +    draw_set_viewport_state(r300->draw, state); +} + +static void r300_set_vertex_buffers(struct pipe_context* pipe, +                                    unsigned count, +                                    const struct pipe_vertex_buffer* buffers) +{ +    struct r300_context* r300 = r300_context(pipe); +    /* XXX Draw */ +    draw_flush(r300->draw); +    draw_set_vertex_buffers(r300->draw, count, buffers); +} + +static void r300_set_vertex_elements(struct pipe_context* pipe, +                                    unsigned count, +                                    const struct pipe_vertex_element* elements) +{ +    struct r300_context* r300 = r300_context(pipe); +    /* XXX Draw */ +    draw_flush(r300->draw); +    draw_set_vertex_elements(r300->draw, count, elements); +} + +static void* r300_create_vs_state(struct pipe_context* pipe, +                                  const struct pipe_shader_state* state) +{ +    struct r300_context* context = r300_context(pipe); +    /* XXX handing this off to Draw for now */ +    return draw_create_vertex_shader(context->draw, state); +} + +static void r300_bind_vs_state(struct pipe_context* pipe, void* state) { +    struct r300_context* context = r300_context(pipe); +    /* XXX handing this off to Draw for now */ +    draw_bind_vertex_shader(context->draw, (struct draw_vertex_shader*)state); +} + +static void r300_delete_vs_state(struct pipe_context* pipe, void* state) +{ +    struct r300_context* context = r300_context(pipe); +    /* XXX handing this off to Draw for now */ +    draw_delete_vertex_shader(context->draw, (struct draw_vertex_shader*)state); +} + +void r300_init_state_functions(struct r300_context* r300) +{ +    r300->context.create_blend_state = r300_create_blend_state; +    r300->context.bind_blend_state = r300_bind_blend_state; +    r300->context.delete_blend_state = r300_delete_blend_state; + +    r300->context.set_blend_color = r300_set_blend_color; + +    r300->context.set_clip_state = r300_set_clip_state; + +  
  r300->context.set_constant_buffer = r300_set_constant_buffer; + +    r300->context.create_depth_stencil_alpha_state = r300_create_dsa_state; +    r300->context.bind_depth_stencil_alpha_state = r300_bind_dsa_state; +    r300->context.delete_depth_stencil_alpha_state = r300_delete_dsa_state; + +    r300->context.set_edgeflags = r300_set_edgeflags; + +    r300->context.set_framebuffer_state = r300_set_framebuffer_state; + +    r300->context.create_fs_state = r300_create_fs_state; +    r300->context.bind_fs_state = r300_bind_fs_state; +    r300->context.delete_fs_state = r300_delete_fs_state; + +    r300->context.set_polygon_stipple = r300_set_polygon_stipple; + +    r300->context.create_rasterizer_state = r300_create_rs_state; +    r300->context.bind_rasterizer_state = r300_bind_rs_state; +    r300->context.delete_rasterizer_state = r300_delete_rs_state; + +    r300->context.create_sampler_state = r300_create_sampler_state; +    r300->context.bind_sampler_states = r300_bind_sampler_states; +    r300->context.delete_sampler_state = r300_delete_sampler_state; + +    r300->context.set_sampler_textures = r300_set_sampler_textures; + +    r300->context.set_scissor_state = r300_set_scissor_state; + +    r300->context.set_viewport_state = r300_set_viewport_state; + +    r300->context.set_vertex_buffers = r300_set_vertex_buffers; +    r300->context.set_vertex_elements = r300_set_vertex_elements; + +    r300->context.create_vs_state = r300_create_vs_state; +    r300->context.bind_vs_state = r300_bind_vs_state; +    r300->context.delete_vs_state = r300_delete_vs_state; +} diff --git a/src/gallium/drivers/r300/r300_state_shader.c b/src/gallium/drivers/r300/r300_state_shader.c new file mode 100644 index 0000000000..e87172128f --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_shader.c @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_state_shader.h" + +void r300_translate_shader(struct r300_context* r300, +                           struct r300_fragment_shader* fs) +{ +} + +void r500_translate_shader(struct r300_context* r300, +                           struct r500_fragment_shader* fs) +{ +} diff --git a/src/gallium/drivers/r300/r300_state_shader.h b/src/gallium/drivers/r300/r300_state_shader.h new file mode 100644 index 0000000000..a20bd4276c --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_shader.h @@ -0,0 +1,35 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_STATE_SHADER_H +#define R300_STATE_SHADER_H + +#include "r300_context.h" +#include "r300_screen.h" + +void r300_translate_shader(struct r300_context* r300, +                           struct r300_fragment_shader* fs); + +void r500_translate_shader(struct r300_context* r300, +                           struct r500_fragment_shader* fs); + +#endif /* R300_STATE_SHADER_H */ diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c new file mode 100644 index 0000000000..1e1f96a7f9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_surface.c @@ -0,0 +1,352 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_surface.h" + +/* Provides pipe_context's "surface_fill". Commonly used for clearing + * buffers. */ +static void r300_surface_fill(struct pipe_context* pipe, +                              struct pipe_surface* dest, +                              unsigned x, unsigned y, +                              unsigned w, unsigned h, +                              unsigned color) +{ +    struct r300_context* r300 = r300_context(pipe); +    CS_LOCALS(r300); +    struct r300_capabilities* caps = ((struct r300_screen*)pipe->screen)->caps; +    struct r300_texture* tex = (struct r300_texture*)dest->texture; +    int i; +    float r, g, b, a; +    r = (float)((color >> 16) & 0xff) / 255.0f; +    g = (float)((color >>  8) & 0xff) / 255.0f; +    b = (float)((color >>  0) & 0xff) / 255.0f; +    debug_printf("r300: Filling surface %p at (%d,%d)," +        " dimensions %dx%d (stride %d), color 0x%x\n", +        dest, x, y, w, h, dest->stride, color); + +    /* Fallback? */ +    if (0) { +        debug_printf("r300: Falling back on surface clear..."); +        void* map = pipe->screen->surface_map(pipe->screen, dest, +            PIPE_BUFFER_USAGE_CPU_WRITE); +        pipe_fill_rect(map, &dest->block, &dest->stride, x, y, w, h, color); +        pipe->screen->surface_unmap(pipe->screen, dest); +        return; +    } + +BEGIN_CS((caps->is_r500) ? 309 : 280); +R300_PACIFY; +OUT_CS_REG(R300_TX_INVALTAGS, 0x0); +R300_PACIFY; +/* Flush PVS. */ +OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0); + +OUT_CS_REG(R300_SE_VTE_CNTL, R300_VPORT_X_SCALE_ENA | +    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA | +    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA | +    R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT); +/* Vertex size. */ +OUT_CS_REG(R300_VAP_VTX_SIZE, 0x8); +/* Max and min vertex index clamp. */ +OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, 0xFFFFFF); +OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0x0); +/* XXX endian */ +OUT_CS_REG(R300_VAP_CNTL_STATUS, R300_VC_NO_SWAP); +OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, 0x0); +/* XXX magic number not in r300_reg */ +OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, 0xAAAAAAAA); +OUT_CS_REG(R300_VAP_CLIP_CNTL, 0x0); +OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4); +OUT_CS_32F(1.0); +OUT_CS_32F(1.0); +OUT_CS_32F(1.0); +OUT_CS_32F(1.0); +/* XXX is this too long? */ +OUT_CS_REG(VAP_PVS_VTX_TIMEOUT_REG, 0xFFFF); +OUT_CS_REG(R300_GB_ENABLE, R300_GB_POINT_STUFF_ENABLE | +    R300_GB_LINE_STUFF_ENABLE | R300_GB_TRIANGLE_STUFF_ENABLE); +/* XXX more magic numbers */ +OUT_CS_REG(R300_GB_MSPOS0, 0x66666666); +OUT_CS_REG(R300_GB_MSPOS1, 0x66666666); +/* XXX why doesn't classic Mesa write the number of pipes, too? */ +OUT_CS_REG(R300_GB_TILE_CONFIG, R300_GB_TILE_ENABLE | R300_GB_TILE_SIZE_16); +OUT_CS_REG(R300_GB_SELECT, R300_GB_FOG_SELECT_1_1_W); +OUT_CS_REG(R300_GB_AA_CONFIG, 0x0); +/* XXX point tex stuffing */ +OUT_CS_REG_SEQ(R300_GA_POINT_S0, 1); +OUT_CS_32F(0.0); +OUT_CS_REG_SEQ(R300_GA_POINT_S1, 1); +OUT_CS_32F(1.0); +OUT_CS_REG(R300_GA_TRIANGLE_STIPPLE, 0x5 | +    (0x5 << R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT)); +/* XXX should this be related to the actual point size? 
*/ +OUT_CS_REG(R300_GA_POINT_MINMAX, 0x6 | +    (0x1800 << R300_GA_POINT_MINMAX_MAX_SHIFT)); +/* XXX this big chunk should be refactored into rs_state */ +OUT_CS_REG(R300_GA_LINE_CNTL, 0x00030006); +OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, 0x3BAAAAAB); +OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, 0x00000000); +OUT_CS_REG(R300_GA_LINE_S0, 0x00000000); +OUT_CS_REG(R300_GA_LINE_S1, 0x3F800000); +OUT_CS_REG(R300_GA_ENHANCE, 0x00000002); +OUT_CS_REG(R300_GA_COLOR_CONTROL, 0x0003AAAA); +OUT_CS_REG(R300_GA_SOLID_RG, 0x00000000); +OUT_CS_REG(R300_GA_SOLID_BA, 0x00000000); +OUT_CS_REG(R300_GA_POLY_MODE, 0x00000000); +OUT_CS_REG(R300_GA_ROUND_MODE, 0x00000001); +OUT_CS_REG(R300_GA_OFFSET, 0x00000000); +OUT_CS_REG(R300_GA_FOG_SCALE, 0x3DBF1412); +OUT_CS_REG(R300_GA_FOG_OFFSET, 0x00000000); +OUT_CS_REG(R300_SU_TEX_WRAP, 0x00000000); +OUT_CS_REG(R300_SU_POLY_OFFSET_FRONT_SCALE, 0x00000000); +OUT_CS_REG(R300_SU_POLY_OFFSET_FRONT_OFFSET, 0x00000000); +OUT_CS_REG(R300_SU_POLY_OFFSET_BACK_SCALE, 0x00000000); +OUT_CS_REG(R300_SU_POLY_OFFSET_BACK_OFFSET, 0x00000000); +OUT_CS_REG(R300_SU_POLY_OFFSET_ENABLE, 0x00000000); +OUT_CS_REG(R300_SU_CULL_MODE, 0x00000000); +OUT_CS_REG(R300_SU_DEPTH_SCALE, 0x4B7FFFFF); +OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0x00000000); +OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C); +OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525); +OUT_CS_REG(R300_SC_SCREENDOOR, 0x00FFFFFF); +OUT_CS_REG(R300_FG_FOG_BLEND, 0x00000002); +OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x00000000); +OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x00000000); +OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x00000000); +OUT_CS_REG(R300_FG_DEPTH_SRC, 0x00000000); +OUT_CS_REG(R300_FG_DEPTH_SRC, 0x00000000); +OUT_CS_REG(R300_RB3D_CCTL, 0x00000000); +OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0x0000000F); + +/* XXX: Oh the wonderful unknown */ +OUT_CS_REG_SEQ(0x4E54, 8); +for (i = 0; i < 8; i++) +    OUT_CS(0x00000000); +OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000); +OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000); +OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFFFFFFFF); +OUT_CS_REG(R300_ZB_FORMAT, 0x00000002); +OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, 0x00000003); +OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000); +OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000); +OUT_CS_REG(0x4F30, 0x00000000); +OUT_CS_REG(0x4F34, 0x00000000); +OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000); +OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000); +R300_PACIFY; +if (caps->has_tcl) { +    OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, +        (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) | +        ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | +            R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)); +} else { +    OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, +        (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) | +        ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | +            R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)); +} +OUT_CS_REG(R300_FG_FOG_BLEND, 0x00000000); +OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0xF688F688); +OUT_CS_REG(R300_VAP_VTX_STATE_CNTL, 0x1); +OUT_CS_REG(R300_VAP_VSM_VTX_ASSM, 0x405); +OUT_CS_REG(R300_SE_VTE_CNTL, 0x0000043F); +OUT_CS_REG(R300_VAP_VTX_SIZE, 0x00000008); +OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, 0xAAAAAAAA); +OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_0, 0x00000003); +OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_1, 0x00000000); +OUT_CS_REG(R300_TX_ENABLE, 0x0); +/* XXX viewport setup */ +OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6); +OUT_CS_32F(1.0); +OUT_CS_32F((float)x); +OUT_CS_32F(1.0); +OUT_CS_32F((float)y); +OUT_CS_32F(1.0); +OUT_CS_32F(0.0); + +if 
(caps->has_tcl) { +    OUT_CS_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE | +        R300_PS_UCP_MODE_CLIP_AS_TRIFAN); +} + +OUT_CS_REG(R300_GA_POINT_SIZE, ((h * 6) & R300_POINTSIZE_Y_MASK) | +    ((w * 6) << R300_POINTSIZE_X_SHIFT)); + +/* XXX RS block and fp setup */ +if (caps->is_r500) { +    OUT_CS_REG_SEQ(R500_RS_IP_0, 8); +    for (i = 0; i < 8; i++) { +        /* I like the operator macros more than the shift macros... */ +        OUT_CS((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) | +            (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) | +            (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) | +            (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT)); +    } +    /* XXX */ +    OUT_CS_REG_SEQ(R300_RS_COUNT, 2); +    OUT_CS((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN); +    OUT_CS(0x0); +    OUT_CS_REG(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE); + +    OUT_CS_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO); +    OUT_CS_REG(R500_US_PIXSIZE, 0x00000000); +    OUT_CS_REG(R500_US_CODE_ADDR, R500_US_CODE_START_ADDR(0) | +        R500_US_CODE_END_ADDR(1)); +    OUT_CS_REG(R500_US_CODE_RANGE, R500_US_CODE_RANGE_ADDR(0) | +        R500_US_CODE_RANGE_SIZE(1)); +    OUT_CS_REG(R500_US_CODE_OFFSET, R500_US_CODE_OFFSET_ADDR(0)); +    R300_PACIFY; +    OUT_CS_REG(R500_GA_US_VECTOR_INDEX, +        0 | R500_GA_US_VECTOR_INDEX_TYPE_INSTR); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT | R500_INST_LAST | +        R500_INST_RGB_OMASK_R | R500_INST_RGB_OMASK_G | R500_INST_RGB_OMASK_B | +        R500_INST_ALPHA_OMASK | R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST | +        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST | +        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R | +        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B | +        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R | +        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A); +    OUT_CS_REG(R500_GA_US_VECTOR_DATA, +        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 | +        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 | +        R500_ALU_RGBA_A_SWIZ_0); +} else { +    OUT_CS_REG_SEQ(R300_RS_IP_0, 8); +    for (i = 0; i < 8; i++) { +        OUT_CS(R300_RS_SEL_T(R300_RS_SEL_K0) | +            R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1)); +    } +    /* XXX */ +    OUT_CS_REG_SEQ(R300_RS_COUNT, 2); +    OUT_CS((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN); +    OUT_CS(1); +    OUT_CS_REG(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE); + +    /* XXX magic numbers */ +    OUT_CS_REG(R300_US_CONFIG, 0); +    OUT_CS_REG(R300_US_PIXSIZE, 2); +    OUT_CS_REG(R300_US_CODE_OFFSET, 0x0); +    OUT_CS_REG(R300_US_CODE_ADDR_0, 0x0); +    OUT_CS_REG(R300_US_CODE_ADDR_1, 0x0); +    OUT_CS_REG(R300_US_CODE_ADDR_2, 0x0); +    OUT_CS_REG(R300_US_CODE_ADDR_3, 0x400000); +    OUT_CS_REG(R300_US_ALU_RGB_INST_0, 0x50A80); +    OUT_CS_REG(R300_US_ALU_RGB_ADDR_0, 0x1C000000); +    OUT_CS_REG(R300_US_ALU_ALPHA_INST_0, 0x40889); +    OUT_CS_REG(R300_US_ALU_ALPHA_ADDR_0, 0x1000000); +    OUT_CS_REG_SEQ(R300_US_OUT_FMT_0, 
4); +    OUT_CS(R300_C0_SEL_B | R300_C1_SEL_G | R300_C2_SEL_R | R300_C3_SEL_A); +    OUT_CS(R300_US_OUT_FMT_UNUSED); +    OUT_CS(R300_US_OUT_FMT_UNUSED); +    OUT_CS(R300_US_OUT_FMT_UNUSED); +    OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W0); +} +/* XXX these magic numbers should be explained when + * this becomes a cached state object */ +if (caps->has_tcl) { +    OUT_CS_REG(R300_VAP_CNTL, 0xA | +        (0x5 << R300_PVS_NUM_CNTLRS_SHIFT) | +        (0xB << R300_VF_MAX_VTX_NUM_SHIFT) | +        (caps->num_vert_fpus << R300_PVS_NUM_FPUS_SHIFT)); +    OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, 0x00100000); +    OUT_CS_REG(R300_VAP_PVS_CONST_CNTL, 0x00000000); +    OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, 0x00000001); +    R300_PACIFY; +    /* XXX translate these back into normal instructions */ +    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x1); +    OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0x0); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0xF00203); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0xD10001); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0x1248001); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0x0); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0xF02203); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0xD10021); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0x1248021); +    OUT_CS_REG(R300_VAP_PVS_UPLOAD_DATA, 0x0); +} else { +    OUT_CS_REG(R300_VAP_CNTL, 0xA | +        (0x5 << R300_PVS_NUM_CNTLRS_SHIFT) | +        (0x5 << R300_VF_MAX_VTX_NUM_SHIFT) | +        (caps->num_vert_fpus << R300_PVS_NUM_FPUS_SHIFT)); +} +R300_PACIFY; +END_CS; + +r300_emit_blend_state(r300, &blend_clear_state); +r300_emit_blend_color_state(r300, &blend_color_clear_state); +r300_emit_dsa_state(r300, &dsa_clear_state); + +BEGIN_CS(36); +R300_PACIFY; +/* Flush colorbuffer and blend caches. */ +OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, +    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D | +    R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL); +OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, +    R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE | +    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE); + +OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0, 1); +OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); +/* XXX this should not be so rigid and it still doesn't work right */ +OUT_CS_REG(R300_RB3D_COLORPITCH0, (dest->stride >> 2) | R300_COLOR_FORMAT_ARGB8888); +OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0x0000000F); +/* XXX Packet3 */ +OUT_CS(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8)); +OUT_CS(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING | +(1 << R300_PRIM_NUM_VERTICES_SHIFT)); +OUT_CS_32F(w / 2.0); +OUT_CS_32F(h / 2.0); +/* XXX this should be the depth value to clear to */ +OUT_CS_32F(1.0); +OUT_CS_32F(1.0); +OUT_CS_32F(r); +OUT_CS_32F(g); +OUT_CS_32F(b); +OUT_CS_32F(1.0); + +/* XXX figure out why this is 0xA and not 0x2 */ +OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, 0xA); +/* XXX OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, +    R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE | +    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE); */ +R300_PACIFY; + +END_CS; +FLUSH_CS; + +    r300->dirty_state = R300_NEW_KITCHEN_SINK; +} + +void r300_init_surface_functions(struct r300_context* r300) +{ +    r300->context.surface_fill = r300_surface_fill; +} diff --git a/src/gallium/drivers/r300/r300_surface.h b/src/gallium/drivers/r300/r300_surface.h new file mode 100644 index 0000000000..e1d53116a1 --- /dev/null +++ b/src/gallium/drivers/r300/r300_surface.h @@ -0,0 +1,58 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of 
this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_SURFACE_H +#define R300_SURFACE_H + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" + +#include "util/u_rect.h" + +#include "r300_context.h" +#include "r300_cs.h" +#include "r300_emit.h" + +const struct r300_blend_state blend_clear_state = { +    .blend_control = 0x0, +    .alpha_blend_control = 0x0, +    .rop = 0x0, +    .dither = 0x0, +}; + +const struct r300_blend_color_state blend_color_clear_state = { +    .blend_color = 0x0, +    .blend_color_red_alpha = 0x0, +    .blend_color_green_blue = 0x0, +}; + +const struct r300_dsa_state dsa_clear_state = { +    .alpha_function = 0x0, +    .alpha_reference = 0x0, +    .z_buffer_control = 0x0, +    .z_stencil_control = 0x0, +    .stencil_ref_mask = R300_STENCILWRITEMASK_MASK, +    .z_buffer_top = R300_ZTOP_ENABLE, +    .stencil_ref_bf = 0x0, +}; + +#endif /* R300_SURFACE_H */ diff --git a/src/gallium/drivers/r300/r300_swtcl_emit.c b/src/gallium/drivers/r300/r300_swtcl_emit.c new file mode 100644 index 0000000000..f6e98d23e9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_swtcl_emit.c @@ -0,0 +1,129 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "draw/draw_pipe.h" +#include "util/u_memory.h" + +#include "r300_cs.h" +#include "r300_context.h" +#include "r300_reg.h" + +/* r300_swtcl_emit: Primitive vertex emission using an immediate + * vertex buffer and no HW TCL. */ + +struct swtcl_stage { +    /* Parent class */ +    struct draw_stage draw; + +    struct r300_context* r300; +}; + +static INLINE struct swtcl_stage* swtcl_stage(struct draw_stage* draw) { +    return (struct swtcl_stage*)draw; +} + +static void r300_emit_vertex(struct r300_context* r300, +                             const struct vertex_header* vertex) +{ +    /* XXX */ +} + +static INLINE void r300_emit_prim(struct draw_stage* draw, +                                  struct prim_header* prim, +                                  unsigned hwprim, +                                  unsigned count) +{ +    struct r300_context* r300 = swtcl_stage(draw)->r300; +    CS_LOCALS(r300); +    int i; + +    r300_emit_dirty_state(r300); + +    /* XXX should be count * vtx size */ +    BEGIN_CS(2 + count + 6); +    OUT_CS(CP_PACKET3(R200_3D_DRAW_IMMD_2, count)); +    OUT_CS(hwprim | R300_PRIM_WALK_RING | +        (count << R300_PRIM_NUM_VERTICES_SHIFT)); + +    for (i = 0; i < count; i++) { +        r300_emit_vertex(r300, prim->v[i]); +    } +    R300_PACIFY; +    END_CS; +} + +/* Just as an aside... + * + * Radeons can do many more primitives: + * - Line strip + * - Triangle fan + * - Triangle strip + * - Line loop + * - Quads + * - Quad strip + * - Polygons + * + * The following were just the only ones in Draw. */ + +static void r300_emit_point(struct draw_stage* draw, struct prim_header* prim) +{ +    r300_emit_prim(draw, prim, R300_PRIM_TYPE_POINT, 1); +} + +static void r300_emit_line(struct draw_stage* draw, struct prim_header* prim) +{ +    r300_emit_prim(draw, prim, R300_PRIM_TYPE_LINE, 2); +} + +static void r300_emit_tri(struct draw_stage* draw, struct prim_header* prim) +{ +    r300_emit_prim(draw, prim, R300_PRIM_TYPE_TRI_LIST, 3); +} + +static void r300_swtcl_flush(struct draw_stage* draw, unsigned flags) +{ +} + +static void r300_reset_stipple(struct draw_stage* draw) +{ +    /* XXX */ +} + +static void r300_swtcl_destroy(struct draw_stage* draw) +{ +    FREE(draw); +} + +struct draw_stage* r300_draw_swtcl_stage(struct r300_context* r300) +{ +    struct swtcl_stage* swtcl = CALLOC_STRUCT(swtcl_stage); + +    swtcl->r300 = r300; +    swtcl->draw.point = r300_emit_point; +    swtcl->draw.line = r300_emit_line; +    swtcl->draw.tri = r300_emit_tri; +    swtcl->draw.flush = r300_swtcl_flush; +    swtcl->draw.reset_stipple_counter = r300_reset_stipple; +    swtcl->draw.destroy = r300_swtcl_destroy; + +    return &swtcl->draw; +} diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c new file mode 100644 index 0000000000..ff812c09f8 --- /dev/null +++ b/src/gallium/drivers/r300/r300_texture.c @@ -0,0 +1,193 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * 
paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_texture.h" + +static int minify(int i) +{ +    return MAX2(1, i >> 1); +} + +static void r300_setup_miptree(struct r300_texture* tex) +{ +    struct pipe_texture* base = &tex->tex; +    int stride, size, offset; + +    for (int i = 0; i <= base->last_level; i++) { +        if (i > 0) { +            base->width[i] = minify(base->width[i-1]); +            base->height[i] = minify(base->height[i-1]); +            base->depth[i] = minify(base->depth[i-1]); +        } + +        base->nblocksx[i] = pf_get_nblocksx(&base->block, base->width[i]); +        base->nblocksy[i] = pf_get_nblocksy(&base->block, base->width[i]); + +        /* Radeons enjoy things in multiples of 32. */ +        /* XXX NPOT -> 64, not 32 */ +        stride = (base->nblocksx[i] * base->block.size + 63) & ~63; +        size = stride * base->nblocksy[i] * base->depth[i]; + +        /* XXX 64 for NPOT */ +        tex->offset[i] = (tex->size + 63) & ~63; +        tex->size = tex->offset[i] + size; +    } +} + +/* Create a new texture. */ +static struct pipe_texture* +    r300_texture_create(struct pipe_screen* screen, +                        const struct pipe_texture* template) +{ +    /* XXX struct r300_screen* r300screen = r300_screen(screen); */ + +    struct r300_texture* tex = CALLOC_STRUCT(r300_texture); + +    if (!tex) { +        return NULL; +    } + +    tex->tex = *template; +    tex->tex.refcount = 1; +    tex->tex.screen = screen; + +    r300_setup_miptree(tex); + +    tex->buffer = screen->buffer_create(screen, 63, +                                        PIPE_BUFFER_USAGE_PIXEL, +                                        tex->size); + +    if (!tex->buffer) { +        FREE(tex); +        return NULL; +    } + +    return (struct pipe_texture*)tex; +} + +static void r300_texture_release(struct pipe_screen* screen, +                                 struct pipe_texture** texture) +{ +    if (!*texture) { +        return; +    } + +    (*texture)->refcount--; + +    if ((*texture)->refcount <= 0) { +        struct r300_texture* tex = (struct r300_texture*)*texture; + +        pipe_buffer_reference(screen, &tex->buffer, NULL); + +        FREE(tex); +    } + +    *texture = NULL; +} + +static struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen, +                                                 struct pipe_texture* texture, +                                                 unsigned face, +                                                 unsigned level, +                                                 unsigned zslice, +                                                 unsigned flags) +{ +    struct r300_texture* tex = (struct r300_texture*)texture; +    struct pipe_surface* surface = CALLOC_STRUCT(pipe_surface); +    unsigned offset; + +    /* XXX this is certainly dependent on tex target */ +    offset = tex->offset[level]; + +    if (surface) { +        surface->refcount = 1; +        
pipe_texture_reference(&surface->texture, texture); +        surface->format = texture->format; +        surface->width = texture->width[level]; +        surface->height = texture->height[level]; +        surface->block = texture->block; +        surface->nblocksx = texture->nblocksx[level]; +        surface->nblocksy = texture->nblocksy[level]; +        /* XXX save the actual stride instead plz kthnxbai */ +        surface->stride = +            (texture->nblocksx[level] * texture->block.size + 63) & ~63; +        surface->offset = offset; +        surface->usage = flags; +        surface->status = PIPE_SURFACE_STATUS_DEFINED; +    } + +    return surface; +} + +static void r300_tex_surface_release(struct pipe_screen* screen, +                                     struct pipe_surface** surface) +{ +    struct pipe_surface* s = *surface; + +    s->refcount--; + +    if (s->refcount <= 0) { +        pipe_texture_reference(&s->texture, NULL); +        FREE(s); +    } + +    *surface = NULL; +} + +static struct pipe_texture* +    r300_texture_blanket(struct pipe_screen* screen, +                         const struct pipe_texture* base, +                         const unsigned* stride, +                         struct pipe_buffer* buffer) +{ +    struct r300_texture* tex; + +    if (base->target != PIPE_TEXTURE_2D || +        base->last_level != 0 || +        base->depth[0] != 1) { +        return NULL; +    } + +    tex = CALLOC_STRUCT(r300_texture); +    if (!tex) { +        return NULL; +    } + +    tex->tex = *base; +    tex->tex.refcount = 1; +    tex->tex.screen = screen; + +    /* XXX tex->stride = *stride; */ + +    pipe_buffer_reference(screen, &tex->buffer, buffer); + +    return (struct pipe_texture*)tex; +} + +void r300_init_screen_texture_functions(struct pipe_screen* screen) +{ +    screen->texture_create = r300_texture_create; +    screen->texture_release = r300_texture_release; +    screen->get_tex_surface = r300_get_tex_surface; +    screen->tex_surface_release = r300_tex_surface_release; +    screen->texture_blanket = r300_texture_blanket; +} diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h new file mode 100644 index 0000000000..7964229a94 --- /dev/null +++ b/src/gallium/drivers/r300/r300_texture.h @@ -0,0 +1,34 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef R300_TEXTURE_H +#define R300_TEXTURE_H + +#include "pipe/p_screen.h" + +#include "util/u_math.h" + +#include "r300_context.h" + +void r300_init_screen_texture_functions(struct pipe_screen* screen); + +#endif /* R300_TEXTURE_H */ diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h new file mode 100644 index 0000000000..5a3a212892 --- /dev/null +++ b/src/gallium/drivers/r300/r300_winsys.h @@ -0,0 +1,94 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_WINSYS_H +#define R300_WINSYS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* The public interface header for the r300 pipe driver. + * Any winsys hosting this pipe needs to implement r300_winsys and then + * call r300_create_context to start things. */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +struct radeon_cs; + +struct r300_winsys { + +    /* PCI ID */ +    uint32_t pci_id; + +    /* GB pipe count */ +    uint32_t gb_pipes; + +    /* CS object. This is very much like Intel's batchbuffer. +     * Fill it full of dwords and relocs and then submit. +     * Repeat as needed. */ +    /* Note: Unlike Mesa's version of this, we don't keep a copy of the CSM +     * that was used to create this CS. Is this a good idea? */ +    /* Note: The pipe driver doesn't know how to use this. This is purely +     * for the winsys. */ +    struct radeon_cs* cs; + +    /* Check to see if there's room for commands. */ +    boolean (*check_cs)(struct radeon_cs* cs, int size); + +    /* Start a command emit. */ +    void (*begin_cs)(struct radeon_cs* cs, +           int size, +           const char* file, +           const char* function, +           int line); + +    /* Write a dword to the command buffer. */ +    void (*write_cs_dword)(struct radeon_cs* cs, uint32_t dword); + +    /* Write a relocated dword to the command buffer. */ +    void (*write_cs_reloc)(struct radeon_cs* cs, +           struct pipe_buffer* bo, +           uint32_t rd, +           uint32_t wd, +           uint32_t flags); + +    /* Finish a command emit. */ +    void (*end_cs)(struct radeon_cs* cs, +           const char* file, +           const char* function, +           int line); + +    /* Flush the CS. 
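     * Submits everything written since the last flush to the kernel and
     * leaves the buffer empty for reuse.
     *
     * As a rough editorial sketch (not part of this header) of how the
     * hooks above are meant to be driven, where 'ws' is the r300_winsys
     * instance and all other names and counts are placeholders:
     *
     *    if (!ws->check_cs(ws->cs, 3))
     *       ws->flush_cs(ws->cs);                         // make room first
     *    ws->begin_cs(ws->cs, 3, __FILE__, __FUNCTION__, __LINE__);
     *    ws->write_cs_dword(ws->cs, header_dword);        // e.g. a register write header
     *    ws->write_cs_dword(ws->cs, value_dword);         // its payload
     *    ws->write_cs_reloc(ws->cs, some_bo, rd, wd, 0);  // relocated buffer address
     *    ws->end_cs(ws->cs, __FILE__, __FUNCTION__, __LINE__);
     *    // ...repeat, then flush once the buffer fills up or the frame ends.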
*/ +    void (*flush_cs)(struct radeon_cs* cs); +}; + +struct pipe_context* r300_create_context(struct pipe_screen* screen, +                                         struct pipe_winsys* winsys, +                                         struct r300_winsys* r300_winsys); + +#ifdef __cplusplus +} +#endif + +#endif /* R300_WINSYS_H */ diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile new file mode 100644 index 0000000000..120bdfd9dd --- /dev/null +++ b/src/gallium/drivers/softpipe/Makefile @@ -0,0 +1,47 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = softpipe + +C_SOURCES = \ +	sp_fs_exec.c \ +	sp_fs_sse.c \ +	sp_fs_llvm.c \ +	sp_clear.c \ +	sp_flush.c \ +	sp_query.c \ +	sp_context.c \ +	sp_draw_arrays.c \ +	sp_prim_setup.c \ +	sp_prim_vbuf.c \ +	sp_quad.c \ +	sp_quad_alpha_test.c \ +	sp_quad_blend.c \ +	sp_quad_colormask.c \ +	sp_quad_coverage.c \ +	sp_quad_depth_test.c \ +	sp_quad_earlyz.c \ +	sp_quad_fs.c \ +	sp_quad_occlusion.c \ +	sp_quad_output.c \ +	sp_quad_stencil.c \ +	sp_quad_stipple.c \ +	sp_screen.c \ +        sp_setup.c \ +	sp_state_blend.c \ +	sp_state_clip.c \ +	sp_state_derived.c \ +	sp_state_fs.c \ +	sp_state_sampler.c \ +	sp_state_rasterizer.c \ +	sp_state_surface.c \ +	sp_state_vertex.c \ +	sp_texture.c \ +	sp_tex_sample.c \ +	sp_tile_cache.c \ +	sp_surface.c  + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript new file mode 100644 index 0000000000..c1f7daa8ab --- /dev/null +++ b/src/gallium/drivers/softpipe/SConscript @@ -0,0 +1,46 @@ +Import('*') + +env = env.Clone() + +softpipe = env.ConvenienceLibrary( +	target = 'softpipe', +	source = [ +		'sp_fs_exec.c', +		'sp_fs_sse.c', +		'sp_fs_llvm.c', +		'sp_clear.c', +		'sp_context.c', +		'sp_draw_arrays.c', +		'sp_flush.c', +		'sp_prim_setup.c', +		'sp_prim_vbuf.c', +		'sp_setup.c', +		'sp_quad_alpha_test.c', +		'sp_quad_blend.c', +		'sp_quad.c', +		'sp_quad_colormask.c', +		'sp_quad_coverage.c', +		'sp_quad_depth_test.c', +		'sp_quad_earlyz.c', +		'sp_quad_fs.c', +		'sp_quad_occlusion.c', +		'sp_quad_output.c', +		'sp_quad_stencil.c', +		'sp_quad_stipple.c', +		'sp_query.c', +		'sp_screen.c', +		'sp_state_blend.c', +		'sp_state_clip.c', +		'sp_state_derived.c', +		'sp_state_fs.c', +		'sp_state_rasterizer.c', +		'sp_state_sampler.c', +		'sp_state_surface.c', +		'sp_state_vertex.c', +		'sp_surface.c', +		'sp_tex_sample.c', +		'sp_texture.c', +		'sp_tile_cache.c', +	]) + +Export('softpipe')
\ No newline at end of file diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c new file mode 100644 index 0000000000..ad108ec446 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_clear.c @@ -0,0 +1,107 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "util/u_pack_color.h" +#include "sp_clear.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_state.h" +#include "sp_tile_cache.h" + + +/** + * Convert packed pixel from one format to another. + */ +static unsigned +convert_color(enum pipe_format srcFormat, unsigned srcColor, +              enum pipe_format dstFormat) +{ +   ubyte r, g, b, a; +   unsigned dstColor; + +   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a); +   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor); + +   return dstColor; +} + + + +/** + * Clear the given surface to the specified value. + * No masking, no scissor (clear entire buffer). + * Note: when clearing a color buffer, the clearValue is always + * encoded as PIPE_FORMAT_A8R8G8B8_UNORM. + */ +void +softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps, +               unsigned clearValue) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   uint i; + +   if (softpipe->no_rast) +      return; + +#if 0 +   softpipe_update_derived(softpipe); /* not needed?? 
*/ +#endif + +   if (ps == sp_tile_cache_get_surface(softpipe->zsbuf_cache)) { +      sp_tile_cache_clear(softpipe->zsbuf_cache, clearValue); +      softpipe->framebuffer.zsbuf->status = PIPE_SURFACE_STATUS_CLEAR; +#if TILE_CLEAR_OPTIMIZATION +      return; +#endif +   } + +   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) { +      if (ps == sp_tile_cache_get_surface(softpipe->cbuf_cache[i])) { +         unsigned cv; +         if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { +            cv = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue, +                               ps->format); +         } +         else { +            cv = clearValue; +         } +         sp_tile_cache_clear(softpipe->cbuf_cache[i], cv); +         softpipe->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_CLEAR; +      } +   } + +#if !TILE_CLEAR_OPTIMIZATION +   /* non-cached surface */ +   pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +#endif +} diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h new file mode 100644 index 0000000000..a8ed1c4ecc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_clear.h @@ -0,0 +1,43 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Brian Paul + */ + +#ifndef SP_CLEAR_H +#define SP_CLEAR_H + +#include "pipe/p_state.h" +struct pipe_context; + +extern void +softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps, +               unsigned clearValue); + + +#endif /* SP_CLEAR_H */ diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c new file mode 100644 index 0000000000..c2d882a819 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -0,0 +1,288 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2008 VMware, Inc.  All rights reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_clear.h" +#include "sp_context.h" +#include "sp_flush.h" +#include "sp_prim_setup.h" +#include "sp_prim_vbuf.h" +#include "sp_state.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_texture.h" +#include "sp_winsys.h" +#include "sp_query.h" + + + +/** + * Map any drawing surfaces which aren't already mapped + */ +void +softpipe_map_surfaces(struct softpipe_context *sp) +{ +   unsigned i; + +   for (i = 0; i < sp->framebuffer.nr_cbufs; i++) { +      sp_tile_cache_map_surfaces(sp->cbuf_cache[i]); +   } + +   sp_tile_cache_map_surfaces(sp->zsbuf_cache); +} + + +/** + * Unmap any mapped drawing surfaces + */ +void +softpipe_unmap_surfaces(struct softpipe_context *sp) +{ +   uint i; + +   for (i = 0; i < sp->framebuffer.nr_cbufs; i++) +      sp_flush_tile_cache(sp, sp->cbuf_cache[i]); +   sp_flush_tile_cache(sp, sp->zsbuf_cache); + +   for (i = 0; i < sp->framebuffer.nr_cbufs; i++) { +      sp_tile_cache_unmap_surfaces(sp->cbuf_cache[i]); +   } +   sp_tile_cache_unmap_surfaces(sp->zsbuf_cache); +} + + +static void softpipe_destroy( struct pipe_context *pipe ) +{ +   struct softpipe_context *softpipe = softpipe_context( pipe ); +   struct pipe_screen *screen = pipe->screen; +   uint i; + +   if (softpipe->draw) +      draw_destroy( softpipe->draw ); + +   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { +      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple ); +      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz ); +      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade ); +      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test ); +      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test ); +      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test ); +      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion ); +      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage ); +      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend ); +      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask ); +      
softpipe->quad[i].output->destroy( softpipe->quad[i].output ); +   } + +   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) +      sp_destroy_tile_cache(softpipe->cbuf_cache[i]); +   sp_destroy_tile_cache(softpipe->zsbuf_cache); + +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) +      sp_destroy_tile_cache(softpipe->tex_cache[i]); + +   for (i = 0; i < Elements(softpipe->constants); i++) { +      if (softpipe->constants[i].buffer) { +         pipe_buffer_reference(screen, &softpipe->constants[i].buffer, NULL); +      } +   } + +   FREE( softpipe ); +} + + +struct pipe_context * +softpipe_create( struct pipe_screen *screen, +                 struct pipe_winsys *pipe_winsys, +                 void *unused ) +{ +   struct softpipe_context *softpipe = CALLOC_STRUCT(softpipe_context); +   uint i; + +   util_init_math(); + +#ifdef PIPE_ARCH_X86 +   softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE ); +#else +   softpipe->use_sse = FALSE; +#endif + +   softpipe->dump_fs = debug_get_bool_option( "GALLIUM_DUMP_FS", FALSE ); + +   softpipe->pipe.winsys = pipe_winsys; +   softpipe->pipe.screen = screen; +   softpipe->pipe.destroy = softpipe_destroy; + +   /* state setters */ +   softpipe->pipe.create_blend_state = softpipe_create_blend_state; +   softpipe->pipe.bind_blend_state   = softpipe_bind_blend_state; +   softpipe->pipe.delete_blend_state = softpipe_delete_blend_state; + +   softpipe->pipe.create_sampler_state = softpipe_create_sampler_state; +   softpipe->pipe.bind_sampler_states  = softpipe_bind_sampler_states; +   softpipe->pipe.delete_sampler_state = softpipe_delete_sampler_state; + +   softpipe->pipe.create_depth_stencil_alpha_state = softpipe_create_depth_stencil_state; +   softpipe->pipe.bind_depth_stencil_alpha_state   = softpipe_bind_depth_stencil_state; +   softpipe->pipe.delete_depth_stencil_alpha_state = softpipe_delete_depth_stencil_state; + +   softpipe->pipe.create_rasterizer_state = softpipe_create_rasterizer_state; +   softpipe->pipe.bind_rasterizer_state   = softpipe_bind_rasterizer_state; +   softpipe->pipe.delete_rasterizer_state = softpipe_delete_rasterizer_state; + +   softpipe->pipe.create_fs_state = softpipe_create_fs_state; +   softpipe->pipe.bind_fs_state   = softpipe_bind_fs_state; +   softpipe->pipe.delete_fs_state = softpipe_delete_fs_state; + +   softpipe->pipe.create_vs_state = softpipe_create_vs_state; +   softpipe->pipe.bind_vs_state   = softpipe_bind_vs_state; +   softpipe->pipe.delete_vs_state = softpipe_delete_vs_state; + +   softpipe->pipe.set_blend_color = softpipe_set_blend_color; +   softpipe->pipe.set_clip_state = softpipe_set_clip_state; +   softpipe->pipe.set_constant_buffer = softpipe_set_constant_buffer; +   softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state; +   softpipe->pipe.set_polygon_stipple = softpipe_set_polygon_stipple; +   softpipe->pipe.set_scissor_state = softpipe_set_scissor_state; +   softpipe->pipe.set_sampler_textures = softpipe_set_sampler_textures; +   softpipe->pipe.set_viewport_state = softpipe_set_viewport_state; + +   softpipe->pipe.set_vertex_buffers = softpipe_set_vertex_buffers; +   softpipe->pipe.set_vertex_elements = softpipe_set_vertex_elements; + +   softpipe->pipe.draw_arrays = softpipe_draw_arrays; +   softpipe->pipe.draw_elements = softpipe_draw_elements; +   softpipe->pipe.draw_range_elements = softpipe_draw_range_elements; +   softpipe->pipe.set_edgeflags = softpipe_set_edgeflags; + + +   softpipe->pipe.clear = softpipe_clear; +   softpipe->pipe.flush = softpipe_flush; + +   
softpipe_init_query_funcs( softpipe ); +   softpipe_init_texture_funcs( softpipe ); + +   /* +    * Alloc caches for accessing drawing surfaces and textures. +    * Must be before quad stage setup! +    */ +   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) +      softpipe->cbuf_cache[i] = sp_create_tile_cache( screen ); +   softpipe->zsbuf_cache = sp_create_tile_cache( screen ); + +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) +      softpipe->tex_cache[i] = sp_create_tile_cache( screen ); + + +   /* setup quad rendering stages */ +   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { +      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe); +      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe); +      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe); +      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe); +      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe); +      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe); +      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe); +      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe); +      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe); +      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe); +      softpipe->quad[i].output = sp_quad_output_stage(softpipe); +   } + +   /* vertex shader samplers */ +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { +      softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex; +      softpipe->tgsi.vert_samplers[i].unit = i; +      softpipe->tgsi.vert_samplers[i].sp = softpipe; +      softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i]; +      softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i]; +   } + +   /* fragment shader samplers */ +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { +      softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment; +      softpipe->tgsi.frag_samplers[i].unit = i; +      softpipe->tgsi.frag_samplers[i].sp = softpipe; +      softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i]; +      softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i]; +   } + +   /* +    * Create drawing context and plug our rendering stage into it. +    */ +   softpipe->draw = draw_create(); +   if (!softpipe->draw)  +      goto fail; + +   draw_texture_samplers(softpipe->draw, +                         PIPE_MAX_SAMPLERS, +                         (struct tgsi_sampler **) +                            softpipe->tgsi.vert_samplers_list); + +   softpipe->setup = sp_draw_render_stage(softpipe); +   if (!softpipe->setup) +      goto fail; + +   if (debug_get_bool_option( "SP_NO_RAST", FALSE )) +      softpipe->no_rast = TRUE; + +   if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) { +      /* Deprecated path -- vbuf is the intended interface to the draw module: +       */ +      draw_set_rasterize_stage(softpipe->draw, softpipe->setup); +   } +   else { +      sp_init_vbuf(softpipe); +   } + +   /* plug in AA line/point stages */ +   draw_install_aaline_stage(softpipe->draw, &softpipe->pipe); +   draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe); + +#if USE_DRAW_STAGE_PSTIPPLE +   /* Do polygon stipple w/ texture map + frag prog? 
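    *
    * USE_DRAW_STAGE_PSTIPPLE is defined in sp_context.h; when it is zero,
    * stippling is handled by the sp_quad_stipple.c quad stage instead.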
*/ +   draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe); +#endif + +   sp_init_surface_functions(softpipe); + +   return &softpipe->pipe; + + fail: +   softpipe_destroy(&softpipe->pipe); +   return NULL; +} + diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h new file mode 100644 index 0000000000..e2451c6ecb --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -0,0 +1,174 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_CONTEXT_H +#define SP_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" + +#include "draw/draw_vertex.h" + +#include "sp_quad.h" +#include "sp_tex_sample.h" + + +/** + * This is a temporary variable for testing draw-stage polygon stipple. + * If zero, do stipple in sp_quad_stipple.c + */ +#define USE_DRAW_STAGE_PSTIPPLE 1 + +/* Number of threads working on individual quads. + * Setting to 1 disables this feature. 
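 *
 * Each potential thread owns a full copy of the quad-stage pipeline (the
 * quad[SP_NUM_QUAD_THREADS] array below), so dispatch is roughly as in the
 * sketch here, assuming the quad_stage run() hook used elsewhere in this
 * patch ("thread" and "quad" are placeholders):
 *
 *    struct quad_stage *first = softpipe->quad[thread].first;
 *    first->run(first, quad);
 *
 * With the default value of 1 only quad[0] is ever exercised.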
+ */ +#define SP_NUM_QUAD_THREADS 1 + +struct softpipe_winsys; +struct softpipe_vbuf_render; +struct draw_context; +struct draw_stage; +struct softpipe_tile_cache; +struct sp_fragment_shader; +struct sp_vertex_shader; + + +struct softpipe_context { +   struct pipe_context pipe;  /**< base class */ + +   /* The most recent drawing state as set by the driver: +    */ +   const struct pipe_blend_state   *blend; +   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; +   const struct pipe_depth_stencil_alpha_state   *depth_stencil; +   const struct pipe_rasterizer_state *rasterizer; +   const struct sp_fragment_shader *fs; +   const struct sp_vertex_shader *vs; + +   struct pipe_blend_color blend_color; +   struct pipe_clip_state clip; +   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES]; +   struct pipe_framebuffer_state framebuffer; +   struct pipe_poly_stipple poly_stipple; +   struct pipe_scissor_state scissor; +   struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; +   struct pipe_viewport_state viewport; +   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; +   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; +   unsigned dirty; + +   unsigned num_samplers; +   unsigned num_textures; +   unsigned num_vertex_elements; +   unsigned num_vertex_buffers; + +   boolean no_rast; + +   /* Counter for occlusion queries.  Note this supports overlapping +    * queries. +    */ +   uint64_t occlusion_count; + +   /* +    * Mapped vertex buffers +    */ +   ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS]; +    +   /** Mapped constant buffers */ +   void *mapped_constants[PIPE_SHADER_TYPES]; + +   /** Vertex format */ +   struct vertex_info vertex_info; +   struct vertex_info vertex_info_vbuf; + +   int psize_slot; + +   unsigned reduced_api_prim;  /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */ + +#if 0 +   /* Stipple derived state: +    */ +   ubyte stipple_masks[16][16]; +#endif + +   /** Derived from scissor and surface bounds: */ +   struct pipe_scissor_state cliprect; + +   unsigned line_stipple_counter; + +   /** Software quad rendering pipeline */ +   struct { +      struct quad_stage *polygon_stipple; +      struct quad_stage *earlyz; +      struct quad_stage *shade; +      struct quad_stage *alpha_test; +      struct quad_stage *stencil_test; +      struct quad_stage *depth_test; +      struct quad_stage *occlusion; +      struct quad_stage *coverage; +      struct quad_stage *blend; +      struct quad_stage *colormask; +      struct quad_stage *output; + +      struct quad_stage *first; /**< points to one of the above stages */ +   } quad[SP_NUM_QUAD_THREADS]; + +   /** TGSI exec things */ +   struct { +      struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS]; +      struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS]; +      struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS]; +      struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS]; +   } tgsi; + +   /** The primitive drawing context */ +   struct draw_context *draw; +   struct draw_stage *setup; +   struct draw_stage *vbuf; +   struct softpipe_vbuf_render *vbuf_render; + +   struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS]; +   struct softpipe_tile_cache *zsbuf_cache; + +   struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS]; + +   int use_sse : 1; +   int dump_fs : 1; +}; + + +static INLINE struct softpipe_context * +softpipe_context( struct pipe_context *pipe ) +{ +   return (struct softpipe_context *)pipe; +} + +#endif /* SP_CONTEXT_H */ + diff --git 
a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c new file mode 100644 index 0000000000..7e3a25e34b --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c @@ -0,0 +1,208 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Author: + *    Brian Paul + *    Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" + +#include "sp_context.h" +#include "sp_state.h" + +#include "draw/draw_context.h" + + + +static void +softpipe_map_constant_buffers(struct softpipe_context *sp) +{ +   struct pipe_winsys *ws = sp->pipe.winsys; +   uint i, size; + +   for (i = 0; i < PIPE_SHADER_TYPES; i++) { +      if (sp->constants[i].buffer && sp->constants[i].buffer->size) +         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, +                                                  PIPE_BUFFER_USAGE_CPU_READ); +   } + +   if (sp->constants[PIPE_SHADER_VERTEX].buffer) +      size = sp->constants[PIPE_SHADER_VERTEX].buffer->size; +   else +      size = 0; + +   draw_set_mapped_constant_buffer(sp->draw, +                                   sp->mapped_constants[PIPE_SHADER_VERTEX], +                                   size); +} + +static void +softpipe_unmap_constant_buffers(struct softpipe_context *sp) +{ +   struct pipe_winsys *ws = sp->pipe.winsys; +   uint i; + +   /* really need to flush all prims since the vert/frag shaders const buffers +    * are going away now. 
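    *
    * (draw_flush() below pushes any queued primitives through while the
    * mappings made in softpipe_map_constant_buffers() are still valid;
    * only then are the winsys buffers unmapped.)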
+    */ +   draw_flush(sp->draw); + +   draw_set_mapped_constant_buffer(sp->draw, NULL, 0); + +   for (i = 0; i < 2; i++) { +      if (sp->constants[i].buffer && sp->constants[i].buffer->size) +         ws->buffer_unmap(ws, sp->constants[i].buffer); +      sp->mapped_constants[i] = NULL; +   } +} + + +static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { +   PIPE_PRIM_POINTS, +   PIPE_PRIM_LINES, +   PIPE_PRIM_LINES, +   PIPE_PRIM_LINES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES, +   PIPE_PRIM_TRIANGLES +}; + + +boolean +softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, +                     unsigned start, unsigned count) +{ +   return softpipe_draw_elements(pipe, NULL, 0, mode, start, count); +} + + + +/** + * Draw vertex arrays, with optional indexing. + * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. + * + * XXX should the element buffer be specified/bound with a separate function? + */ + +boolean +softpipe_draw_range_elements(struct pipe_context *pipe, +                             struct pipe_buffer *indexBuffer, +                             unsigned indexSize, +                             unsigned min_index, +                             unsigned max_index, +                             unsigned mode, unsigned start, unsigned count) +{ +   struct softpipe_context *sp = softpipe_context(pipe); +   struct draw_context *draw = sp->draw; +   unsigned i; + +   sp->reduced_api_prim = reduced_prim[mode]; + +   if (sp->dirty) +      softpipe_update_derived( sp ); + +   softpipe_map_surfaces(sp); +   softpipe_map_constant_buffers(sp); + +   /* +    * Map vertex buffers +    */ +   for (i = 0; i < sp->num_vertex_buffers; i++) { +      void *buf +         = pipe_buffer_map(pipe->screen, +                                    sp->vertex_buffer[i].buffer, +                                    PIPE_BUFFER_USAGE_CPU_READ); +      draw_set_mapped_vertex_buffer(draw, i, buf); +   } +   /* Map index buffer, if present */ +   if (indexBuffer) { +      void *mapped_indexes +         = pipe_buffer_map(pipe->screen, indexBuffer, +                                    PIPE_BUFFER_USAGE_CPU_READ); +      draw_set_mapped_element_buffer_range(draw, indexSize, +                                           min_index, +                                           max_index, +                                           mapped_indexes); +   } +   else { +      /* no index/element buffer */ +      draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL); +   } + + +   /* draw! 
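    * (Both the indexed and non-indexed paths funnel into draw_arrays();
    * any element buffer was already handed to the draw module above via
    * draw_set_mapped_element_buffer_range().)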
*/ +   draw_arrays(draw, mode, start, count); + +   /* +    * unmap vertex/index buffers - will cause draw module to flush +    */ +   for (i = 0; i < sp->num_vertex_buffers; i++) { +      draw_set_mapped_vertex_buffer(draw, i, NULL); +      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer); +   } +   if (indexBuffer) { +      draw_set_mapped_element_buffer(draw, 0, NULL); +      pipe_buffer_unmap(pipe->screen, indexBuffer); +   } + + +   /* Note: leave drawing surfaces mapped */ +   softpipe_unmap_constant_buffers(sp); + +   return TRUE; +} + +boolean +softpipe_draw_elements(struct pipe_context *pipe, +                       struct pipe_buffer *indexBuffer, +                       unsigned indexSize, +                       unsigned mode, unsigned start, unsigned count) +{ +   return softpipe_draw_range_elements( pipe, indexBuffer, +                                        indexSize, +                                        0, 0xffffffff, +                                        mode, start, count ); +} + + + +void +softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags) +{ +   struct softpipe_context *sp = softpipe_context(pipe); +   draw_set_edgeflags(sp->draw, edgeflags); +} + diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c new file mode 100644 index 0000000000..c21faf57f3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_flush.c @@ -0,0 +1,92 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "draw/draw_context.h" +#include "sp_flush.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_state.h" +#include "sp_tile_cache.h" +#include "sp_winsys.h" + + +void +softpipe_flush( struct pipe_context *pipe, +		unsigned flags, +                struct pipe_fence_handle **fence ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   uint i; + +   draw_flush(softpipe->draw); + +   if (flags & PIPE_FLUSH_TEXTURE_CACHE) { +      for (i = 0; i < softpipe->num_textures; i++) { +         sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]); +      } +   } + +   if (flags & PIPE_FLUSH_RENDER_CACHE) { +      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) +         if (softpipe->cbuf_cache[i]) +            sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]); + +      if (softpipe->zsbuf_cache) +         sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache); + +      /* Need this call for hardware buffers before swapbuffers. +       * +       * there should probably be another/different flush-type function +       * that's called before swapbuffers because we don't always want +       * to unmap surfaces when flushing. +       */ +      softpipe_unmap_surfaces(softpipe); +   } + +   /* Enable to dump BMPs of the color/depth buffers each frame */ +#if 0 +   if(flags & PIPE_FLUSH_FRAME) { +      static unsigned frame_no = 1; +      static char filename[256]; +      util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no); +      debug_dump_surface_bmp(filename, softpipe->framebuffer.cbufs[0]); +      util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no); +      debug_dump_surface_bmp(filename, softpipe->framebuffer.zsbuf); +      ++frame_no; +   } +#endif +    +   if (fence) +      *fence = NULL; +} + diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h new file mode 100644 index 0000000000..68d9b5fa83 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_flush.h @@ -0,0 +1,37 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#ifndef SP_FLUSH_H +#define SP_FLUSH_H + +struct pipe_context; +struct pipe_fence_handle; + +void softpipe_flush(struct pipe_context *pipe, unsigned flags, +                    struct pipe_fence_handle **fence); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h new file mode 100644 index 0000000000..4792ace3a3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs.h @@ -0,0 +1,54 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_FS_H +#define SP_FS_H + +struct sp_fragment_shader * +softpipe_create_fs_exec(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ); + +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ); + +struct sp_fragment_shader * +softpipe_create_fs_llvm(struct softpipe_context *softpipe, +			const struct pipe_shader_state *templ); + +struct tgsi_interp_coef; +struct tgsi_exec_vector; + +void sp_setup_pos_vector(const struct tgsi_interp_coef *coef, +			 float x, float y, +			 struct tgsi_exec_vector *quadpos); + + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c new file mode 100644 index 0000000000..453b0373f0 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -0,0 +1,164 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" +#include "sp_headers.h" + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_parse.h" + +struct sp_exec_fragment_shader +{ +   struct sp_fragment_shader base; +}; + + +/** cast wrapper */ +static INLINE struct sp_exec_fragment_shader * +sp_exec_fragment_shader(const struct sp_fragment_shader *base) +{ +   return (struct sp_exec_fragment_shader *) base; +} + + +/** + * Compute quad X,Y,Z,W for the four fragments in a quad. + * + * This should really be part of the compiled shader. + */ +void +sp_setup_pos_vector(const struct tgsi_interp_coef *coef, +		    float x, float y, +		    struct tgsi_exec_vector *quadpos) +{ +   uint chan; +   /* do X */ +   quadpos->xyzw[0].f[0] = x; +   quadpos->xyzw[0].f[1] = x + 1; +   quadpos->xyzw[0].f[2] = x; +   quadpos->xyzw[0].f[3] = x + 1; + +   /* do Y */ +   quadpos->xyzw[1].f[0] = y; +   quadpos->xyzw[1].f[1] = y; +   quadpos->xyzw[1].f[2] = y + 1; +   quadpos->xyzw[1].f[3] = y + 1; + +   /* do Z and W for all fragments in the quad */ +   for (chan = 2; chan < 4; chan++) { +      const float dadx = coef->dadx[chan]; +      const float dady = coef->dady[chan]; +      const float a0 = coef->a0[chan] + dadx * x + dady * y; +      quadpos->xyzw[chan].f[0] = a0; +      quadpos->xyzw[chan].f[1] = a0 + dadx; +      quadpos->xyzw[chan].f[2] = a0 + dady; +      quadpos->xyzw[chan].f[3] = a0 + dadx + dady; +   } +} + + +static void +exec_prepare( const struct sp_fragment_shader *base, +	      struct tgsi_exec_machine *machine, +	      struct tgsi_sampler **samplers ) +{ +   /* +    * Bind tokens/shader to the interpreter's machine state. +    * Avoid redundant binding. 
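    *
    * The check below compares the machine's currently bound token stream
    * against this shader's private copy (made with tgsi_dup_tokens() in
    * softpipe_create_fs_exec) and rebinds only when they differ.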
+    */ +   if (machine->Tokens != base->shader.tokens) { +      tgsi_exec_machine_bind_shader( machine, +                                     base->shader.tokens, +                                     PIPE_MAX_SAMPLERS, +                                     samplers ); +   } +} + + + + +/* TODO: hide the machine struct in here somewhere, remove from this + * interface: + */ +static unsigned  +exec_run( const struct sp_fragment_shader *base, +	  struct tgsi_exec_machine *machine, +	  struct quad_header *quad ) +{ + +   /* Compute X, Y, Z, W vals for this quad */ +   sp_setup_pos_vector(quad->posCoef,  +		       (float)quad->input.x0, (float)quad->input.y0,  +		       &machine->QuadPos); +    +   return tgsi_exec_machine_run( machine ); +} + + + +static void  +exec_delete( struct sp_fragment_shader *base ) +{ +   FREE((void *) base->shader.tokens); +   FREE(base); +} + + + + + +struct sp_fragment_shader * +softpipe_create_fs_exec(struct softpipe_context *softpipe, +			const struct pipe_shader_state *templ) +{ +   struct sp_exec_fragment_shader *shader; + +   /* Decide whether we'll be codegenerating this shader and if so do +    * that now. +    */ + +   shader = CALLOC_STRUCT(sp_exec_fragment_shader); +   if (!shader) +      return NULL; + +   /* we need to keep a local copy of the tokens */ +   shader->base.shader.tokens = tgsi_dup_tokens(templ->tokens); +   shader->base.prepare = exec_prepare; +   shader->base.run = exec_run; +   shader->base.delete = exec_delete; + +   return &shader->base; +} + diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c new file mode 100644 index 0000000000..34adac5226 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c @@ -0,0 +1,200 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors: + *   Zack Rusin + */ + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_sse2.h" + +#if 0 + +struct sp_llvm_fragment_shader { +   struct sp_fragment_shader base; +   struct gallivm_prog *llvm_prog; +}; + +static void +shade_quad_llvm(struct quad_stage *qs, +                struct quad_header *quad) +{ +   struct quad_shade_stage *qss = quad_shade_stage(qs); +   struct softpipe_context *softpipe = qs->softpipe; +   float dests[4][16][4] ALIGN16_ATTRIB; +   float inputs[4][16][4] ALIGN16_ATTRIB; +   const float fx = (float) quad->x0; +   const float fy = (float) quad->y0; +   struct gallivm_prog *llvm = qss->llvm_prog; + +   inputs[0][0][0] = fx; +   inputs[1][0][0] = fx + 1.0f; +   inputs[2][0][0] = fx; +   inputs[3][0][0] = fx + 1.0f; + +   inputs[0][0][1] = fy; +   inputs[1][0][1] = fy; +   inputs[2][0][1] = fy + 1.0f; +   inputs[3][0][1] = fy + 1.0f; + + +   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef); + +#if DLLVM +   debug_printf("MASK = %d\n", quad->mask); +   for (int i = 0; i < 4; ++i) { +      for (int j = 0; j < 2; ++j) { +         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j,  +                inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]); +      } +   } +#endif + +   quad->mask &= +      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs, +                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT], +                                   qss->samplers); +#if DLLVM +   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n", +          dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3],  +          dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]); +#endif + +   /* store result color */ +   if (qss->colorOutSlot >= 0) { +      unsigned i; +      /* XXX need to handle multiple color outputs someday */ +      allvmrt(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot] +             == TGSI_SEMANTIC_COLOR); +      for (i = 0; i < QUAD_SIZE; ++i) { +         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0]; +         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1]; +         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2]; +         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3]; +      } +   } +#if DLLVM +   for (int i = 0; i < QUAD_SIZE; ++i) { +      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot, +             quad->outputs.color[0][0][i], +             quad->outputs.color[0][1][i], +             quad->outputs.color[0][2][i], +             quad->outputs.color[0][3][i]); +   } +#endif + +   /* store result Z */ +   if (qss->depthOutSlot >= 0) { +      /* output[slot] is new Z */ +      uint i; +      for (i = 0; i < 4; i++) { +         quad->outputs.depth[i] = dests[i][0][2]; +      } +   } +   else { +      /* copy input Z (which was interpolated by the executor) to output Z */ +      uint i; +      for (i = 0; i < 4; i++) { +         quad->outputs.depth[i] = inputs[i][0][2]; +      } +   } +#if DLLVM +   debug_printf("D [%f, %f, %f, %f] mask = %d\n", +             quad->outputs.depth[0], +             quad->outputs.depth[1], +             quad->outputs.depth[2], +             quad->outputs.depth[3], quad->mask); +#endif + +   /* shader may 
cull fragments */ +   if( quad->mask ) { +      qs->next->run( qs->next, quad ); +   } +} + + +unsigned  +run_llvm_fs( const struct sp_fragment_shader *base, +	     struct foo *machine ) +{ +} + + +void  +delete_llvm_fs( struct sp_fragment_shader *base ) +{ +   FREE(base); +} + + +struct sp_fragment_shader * +softpipe_create_fs_llvm(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ) +{ +   struct sp_llvm_fragment_shader *shader = NULL; + +   /* LLVM fragment shaders currently disabled: +    */ +   state = CALLOC_STRUCT(sp_llvm_shader_state); +   if (!state) +      return NULL; + +   state->llvm_prog = 0; + +   if (!gallivm_global_cpu_engine()) { +      gallivm_cpu_engine_create(state->llvm_prog); +   } +   else +      gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog); +    +   if (shader) { +      shader->base.run = run_llvm_fs; +      shader->base.delete = delete_llvm_fs; +   } + +   return shader; +} + + +#else + +struct sp_fragment_shader * +softpipe_create_fs_llvm(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ) +{ +   return NULL; +} + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c new file mode 100644 index 0000000000..9a273c8764 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -0,0 +1,169 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" +#include "sp_headers.h" + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_sse2.h" + + +#if defined(PIPE_ARCH_X86) + +#include "rtasm/rtasm_x86sse.h" + +/* Surely this should be defined somewhere in a tgsi header: + */ +typedef void (PIPE_CDECL *codegen_function)( +   const struct tgsi_exec_vector *input, +   struct tgsi_exec_vector *output, +   const float (*constant)[4], +   struct tgsi_exec_vector *temporary, +   const struct tgsi_interp_coef *coef, +   float (*immediates)[4] +   //, const struct tgsi_exec_vector *quadPos + ); + + +struct sp_sse_fragment_shader { +   struct sp_fragment_shader base; +   struct x86_function             sse2_program; +   codegen_function func; +   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4]; +}; + + + +static void +fs_sse_prepare( const struct sp_fragment_shader *base, +		struct tgsi_exec_machine *machine, +		struct tgsi_sampler **samplers ) +{ +} + + +/* TODO: codegenerate the whole run function, skip this wrapper. + * TODO: break dependency on tgsi_exec_machine struct + * TODO: push Position calculation into the generated shader + * TODO: process >1 quad at a time + */ +static unsigned  +fs_sse_run( const struct sp_fragment_shader *base, +	    struct tgsi_exec_machine *machine, +	    struct quad_header *quad ) +{ +   struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; + +   /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */ +   sp_setup_pos_vector(quad->posCoef,  +		       (float)quad->input.x0, (float)quad->input.y0,  +		       machine->Temps); + +   /* init kill mask */ +   tgsi_set_kill_mask(machine, 0x0); +   tgsi_set_exec_mask(machine, 1, 1, 1, 1); + +   shader->func( machine->Inputs, +		 machine->Outputs, +		 machine->Consts, +		 machine->Temps, +		 machine->InterpCoefs, +                 shader->immediates +		 //	 , &machine->QuadPos +      ); + +   return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]); +} + + +static void  +fs_sse_delete( struct sp_fragment_shader *base ) +{ +   struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; + +   x86_release_func( &shader->sse2_program ); +   FREE(shader); +} + + +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ) +{ +   struct sp_sse_fragment_shader *shader; + +   if (!softpipe->use_sse) +      return NULL; + +   shader = CALLOC_STRUCT(sp_sse_fragment_shader); +   if (!shader) +      return NULL; + +   x86_init_func( &shader->sse2_program ); +    +   if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program, +                        shader->immediates, FALSE )) { +      FREE(shader); +      return NULL; +   } + +   shader->func = (codegen_function) x86_get_func( &shader->sse2_program ); +   if (!shader->func) { +      x86_release_func( &shader->sse2_program ); +      FREE(shader); +      return NULL; +   } + +   shader->base.shader = *templ; +   shader->base.prepare = fs_sse_prepare; +   shader->base.run = fs_sse_run; +   shader->base.delete = fs_sse_delete; + +   return &shader->base; +} + + +#else + +/* Maybe put this varient in the header file. 
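 *
 * On non-x86 builds this variant simply returns NULL, matching what the
 * LLVM path does, so the caller presumably falls back to the interpreted
 * shader from sp_fs_exec.c.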
+ */ +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, +		       const struct pipe_shader_state *templ) +{ +   return NULL; +} + +#endif diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h new file mode 100644 index 0000000000..4a42cb3c19 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_headers.h @@ -0,0 +1,95 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_HEADERS_H +#define SP_HEADERS_H + +#include "pipe/p_state.h" +#include "tgsi/tgsi_exec.h" + +#define PRIM_POINT 1 +#define PRIM_LINE  2 +#define PRIM_TRI   3 + + +/* The rasterizer generates 2x2 quads of fragment and feeds them to + * the current fp_machine (see below). + * Remember that Y=0=top with Y increasing down the window. + */ +#define QUAD_TOP_LEFT     0 +#define QUAD_TOP_RIGHT    1 +#define QUAD_BOTTOM_LEFT  2 +#define QUAD_BOTTOM_RIGHT 3 + +#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL          0xf + + +/** + * Encodes everything we need to know about a 2x2 pixel block.  Uses + * "Channel-Serial" or "SoA" layout.   + */ +struct quad_header_input +{ +   int x0; +   int y0; +   float coverage[QUAD_SIZE];    /** fragment coverage for antialiasing */ +   unsigned facing:1;   /**< Front (0) or back (1) facing? 
*/ +   unsigned prim:2;     /**< PRIM_POINT, LINE, TRI */ +}; + +struct quad_header_inout +{ +   unsigned mask:4; +}; + +struct quad_header_output +{ +   /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */ +   float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE]; +   float depth[QUAD_SIZE]; +}; + +struct quad_header { +   struct quad_header_input input; +   struct quad_header_inout inout; +   struct quad_header_output output; + +   const struct tgsi_interp_coef *coef; +   const struct tgsi_interp_coef *posCoef; + +   unsigned nr_attrs; +}; + +#endif /* SP_HEADERS_H */ + diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c new file mode 100644 index 0000000000..038ff04d4f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_setup.c @@ -0,0 +1,190 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \brief A draw stage that drives our triangle setup routines from + * within the draw pipeline.  One of two ways to drive setup, the + * other being in sp_prim_vbuf.c. + * + * \author  Keith Whitwell <keith@tungstengraphics.com> + * \author  Brian Paul + */ + + +#include "sp_context.h" +#include "sp_setup.h" +#include "sp_state.h" +#include "sp_prim_setup.h" +#include "draw/draw_pipe.h" +#include "draw/draw_vertex.h" +#include "util/u_memory.h" + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_stage { +   struct draw_stage stage; /**< This must be first (base class) */ + +   struct setup_context *setup; +}; + + + +/** + * Basically a cast wrapper. 
+ */ +static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +{ +   return (struct setup_stage *)stage; +} + + +typedef const float (*cptrf4)[4]; + +static void +do_tri(struct draw_stage *stage, struct prim_header *prim) +{ +   struct setup_stage *setup = setup_stage( stage ); +    +   setup_tri( setup->setup, +              (cptrf4)prim->v[0]->data, +              (cptrf4)prim->v[1]->data, +              (cptrf4)prim->v[2]->data ); +} + +static void +do_line(struct draw_stage *stage, struct prim_header *prim) +{ +   struct setup_stage *setup = setup_stage( stage ); + +   setup_line( setup->setup, +               (cptrf4)prim->v[0]->data, +               (cptrf4)prim->v[1]->data ); +} + +static void +do_point(struct draw_stage *stage, struct prim_header *prim) +{ +   struct setup_stage *setup = setup_stage( stage ); + +   setup_point( setup->setup, +                (cptrf4)prim->v[0]->data ); +} + + + + +static void setup_begin( struct draw_stage *stage ) +{ +   struct setup_stage *setup = setup_stage(stage); + +   setup_prepare( setup->setup ); + +   stage->point = do_point; +   stage->line = do_line; +   stage->tri = do_tri; +} + + +static void setup_first_point( struct draw_stage *stage, +			       struct prim_header *header ) +{ +   setup_begin(stage); +   stage->point( stage, header ); +} + +static void setup_first_line( struct draw_stage *stage, +			       struct prim_header *header ) +{ +   setup_begin(stage); +   stage->line( stage, header ); +} + + +static void setup_first_tri( struct draw_stage *stage, +			       struct prim_header *header ) +{ +   setup_begin(stage); +   stage->tri( stage, header ); +} + + + +static void setup_flush( struct draw_stage *stage, +			 unsigned flags ) +{ +   stage->point = setup_first_point; +   stage->line = setup_first_line; +   stage->tri = setup_first_tri; +} + + +static void reset_stipple_counter( struct draw_stage *stage ) +{ +} + + +static void render_destroy( struct draw_stage *stage ) +{ +   struct setup_stage *ssetup = setup_stage(stage); +   setup_destroy_context(ssetup->setup); +   FREE( stage ); +} + + +/** + * Create a new primitive setup/render stage. + */ +struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe ) +{ +   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage); + +   sstage->setup = setup_create_context(softpipe); +   sstage->stage.draw = softpipe->draw; +   sstage->stage.point = setup_first_point; +   sstage->stage.line = setup_first_line; +   sstage->stage.tri = setup_first_tri; +   sstage->stage.flush = setup_flush; +   sstage->stage.reset_stipple_counter = reset_stipple_counter; +   sstage->stage.destroy = render_destroy; + +   return (struct draw_stage *)sstage; +} + +struct setup_context * +sp_draw_setup_context( struct draw_stage *stage ) +{ +   struct setup_stage *ssetup = setup_stage(stage); +   return ssetup->setup; +} + +void +sp_draw_flush( struct draw_stage *stage ) +{ +   stage->flush( stage, 0 ); +} diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h new file mode 100644 index 0000000000..49bdd98ed8 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_setup.h @@ -0,0 +1,85 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#ifndef SP_PRIM_SETUP_H +#define SP_PRIM_SETUP_H + + +/** + * vbuf is a special stage to gather the stream of triangles, lines, points + * together and reconstruct vertex buffers for hardware upload. + * + * First attempt, work in progress. + *  + * TODO: + *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb. + *    - tell vbuf stage how to build hw vertices directly + *    - pass vbuf stage a buffer pointer for direct emit to agp/vram. + * + * + * + * Vertices are just an array of floats, with all the attributes + * packed.  We currently assume a layout like: + * + * attr[0][0..3] - window position + * attr[1..n][0..3] - remaining attributes. + * + * Attributes are assumed to be 4 floats wide but are packed so that + * all the enabled attributes run contiguously. + */ + + +struct draw_stage; +struct softpipe_context; + + +typedef void (*vbuf_draw_func)( struct pipe_context *pipe, +                                unsigned prim, +                                const ushort *elements, +                                unsigned nr_elements, +                                const void *vertex_buffer, +                                unsigned nr_vertices ); + + +extern struct draw_stage * +sp_draw_render_stage( struct softpipe_context *softpipe ); + +extern struct setup_context * +sp_draw_setup_context( struct draw_stage * ); + +extern void +sp_draw_flush( struct draw_stage * ); + + +extern struct draw_stage * +sp_draw_vbuf_stage( struct draw_context *draw_context, +                    struct pipe_context *pipe, +                    vbuf_draw_func draw ); + + +#endif /* SP_PRIM_SETUP_H */ diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c new file mode 100644 index 0000000000..9cd5784e5b --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -0,0 +1,410 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
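[Editor's illustration] The sp_prim_setup.h comment above describes the packed post-transform vertex layout: each vertex is a run of 4-float attributes, with attr[0] holding the window position and the enabled attributes packed contiguously. A tiny standalone sketch of indexing that layout, assuming exactly the layout the comment describes (the helper name and sample data are invented):

#include <stdio.h>

/* Index into a packed vertex buffer laid out as the header comment above
 * describes: num_attrs groups of 4 floats per vertex, attr 0 = window pos. */
static const float *
packed_attr(const float *verts, unsigned num_attrs,
            unsigned vertex, unsigned attr)
{
   return verts + (vertex * num_attrs + attr) * 4;
}

int main(void)
{
   /* two vertices, two attributes each (position + color), 4 floats per attr */
   static const float verts[2 * 2 * 4] = {
      /* v0 */ 10, 20, 0, 1,   1, 0, 0, 1,
      /* v1 */ 30, 40, 0, 1,   0, 1, 0, 1,
   };
   const float *pos1   = packed_attr(verts, 2, 1, 0);
   const float *color1 = packed_attr(verts, 2, 1, 1);
   printf("v1 pos = %g,%g  color = %g,%g,%g\n",
          pos1[0], pos1[1], color1[0], color1[1], color1[2]);
   return 0;
}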
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Interface between 'draw' module's output and the softpipe rasterizer/setup + * code.  When the 'draw' module has finished filling a vertex buffer, the + * draw_arrays() functions below will be called.  Loop over the vertices and + * call the point/line/tri setup functions. + * + * Authors + *  Brian Paul + */ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_prim_vbuf.h" +#include "sp_prim_setup.h" +#include "sp_setup.h" +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + + +#define SP_MAX_VBUF_INDEXES 1024 +#define SP_MAX_VBUF_SIZE    4096 + +typedef const float (*cptrf4)[4]; + +/** + * Subclass of vbuf_render. + */ +struct softpipe_vbuf_render +{ +   struct vbuf_render base; +   struct softpipe_context *softpipe; +   uint prim; +   uint vertex_size; +   void *vertex_buffer; +}; + + +/** cast wrapper */ +static struct softpipe_vbuf_render * +softpipe_vbuf_render(struct vbuf_render *vbr) +{ +   return (struct softpipe_vbuf_render *) vbr; +} + + +static const struct vertex_info * +sp_vbuf_get_vertex_info(struct vbuf_render *vbr) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   return softpipe_get_vbuf_vertex_info(cvbr->softpipe); +} + + +static void * +sp_vbuf_allocate_vertices(struct vbuf_render *vbr, +                            ushort vertex_size, ushort nr_vertices) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   assert(!cvbr->vertex_buffer); +   cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16); +   cvbr->vertex_size = vertex_size; +   return cvbr->vertex_buffer; +} + + +static void +sp_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, +                           unsigned vertex_size, unsigned vertices_used) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   align_free(vertices); +   assert(vertices == cvbr->vertex_buffer); +   cvbr->vertex_buffer = NULL; +} + + +static boolean +sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + +   /* XXX: break this dependency - make setup_context live under +    * softpipe, rename the old "setup" draw stage to something else. 
+    */ +   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup); +    +   setup_prepare( setup_ctx ); + + + +   cvbr->prim = prim; +   return TRUE; + +} + + +static INLINE cptrf4 get_vert( const void *vertex_buffer, +                               int index, +                               int stride ) +{ +   return (cptrf4)((char *)vertex_buffer + index * stride); +} + + +/** + * draw elements / indexed primitives + */ +static void +sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   struct softpipe_context *softpipe = cvbr->softpipe; +   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); +   const void *vertex_buffer = cvbr->vertex_buffer; +   unsigned i; + +   /* XXX: break this dependency - make setup_context live under +    * softpipe, rename the old "setup" draw stage to something else. +    */ +   struct draw_stage *setup = softpipe->setup; +   struct setup_context *setup_ctx = sp_draw_setup_context(setup); + +   switch (cvbr->prim) { +   case PIPE_PRIM_POINTS: +      for (i = 0; i < nr; i++) { +         setup_point( setup_ctx, +                      get_vert(vertex_buffer, indices[i-0], stride) ); +      } +      break; + +   case PIPE_PRIM_LINES: +      for (i = 1; i < nr; i += 2) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, indices[i-1], stride), +                     get_vert(vertex_buffer, indices[i-0], stride) ); +      } +      break; + +   case PIPE_PRIM_LINE_STRIP: +      for (i = 1; i < nr; i ++) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, indices[i-1], stride), +                     get_vert(vertex_buffer, indices[i-0], stride) ); +      } +      break; + +   case PIPE_PRIM_LINE_LOOP: +      for (i = 1; i < nr; i ++) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, indices[i-1], stride), +                     get_vert(vertex_buffer, indices[i-0], stride) ); +      } +      if (nr) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, indices[nr-1], stride), +                     get_vert(vertex_buffer, indices[0], stride) ); +      } +      break; + + +   case PIPE_PRIM_TRIANGLES: +      for (i = 2; i < nr; i += 3) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i-2], stride), +                    get_vert(vertex_buffer, indices[i-1], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); +      } +      break; + +   case PIPE_PRIM_TRIANGLE_STRIP: +      for (i = 2; i < nr; i += 1) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i+(i&1)-2], stride), +                    get_vert(vertex_buffer, indices[i-(i&1)-1], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); +      } +      break; + +   case PIPE_PRIM_TRIANGLE_FAN: +   case PIPE_PRIM_POLYGON: +      for (i = 2; i < nr; i += 1) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[0], stride), +                    get_vert(vertex_buffer, indices[i-1], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); +      } +      break; +   case PIPE_PRIM_QUADS: +      for (i = 3; i < nr; i += 4) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i-3], stride), +                    get_vert(vertex_buffer, indices[i-2], stride), +               
     get_vert(vertex_buffer, indices[i-0], stride)); + +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i-2], stride), +                    get_vert(vertex_buffer, indices[i-1], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); +      } +      break; +   case PIPE_PRIM_QUAD_STRIP: +      for (i = 3; i < nr; i += 2) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i-3], stride), +                    get_vert(vertex_buffer, indices[i-2], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); + +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, indices[i-1], stride), +                    get_vert(vertex_buffer, indices[i-3], stride), +                    get_vert(vertex_buffer, indices[i-0], stride)); +      } +      break; +   default: +      assert(0); +   } + +   /* XXX: why are we calling this???  If we had to call something, it +    * would be a function in sp_setup.c: +    */ +   sp_draw_flush( setup ); +} + + +/** + * This function is hit when the draw module is working in pass-through mode. + * It's up to us to convert the vertex array into point/line/tri prims. + */ +static void +sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   struct softpipe_context *softpipe = cvbr->softpipe; +   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); +   const void *vertex_buffer = +      (void *) get_vert(cvbr->vertex_buffer, start, stride); +   unsigned i; + +   /* XXX: break this dependency - make setup_context live under +    * softpipe, rename the old "setup" draw stage to something else. +    */ +   struct draw_stage *setup = softpipe->setup; +   struct setup_context *setup_ctx = sp_draw_setup_context(setup); + +   switch (cvbr->prim) { +   case PIPE_PRIM_POINTS: +      for (i = 0; i < nr; i++) { +         setup_point( setup_ctx, +                      get_vert(vertex_buffer, i-0, stride) ); +      } +      break; + +   case PIPE_PRIM_LINES: +      for (i = 1; i < nr; i += 2) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, i-1, stride), +                     get_vert(vertex_buffer, i-0, stride) ); +      } +      break; + +   case PIPE_PRIM_LINE_STRIP: +      for (i = 1; i < nr; i ++) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, i-1, stride), +                     get_vert(vertex_buffer, i-0, stride) ); +      } +      break; + +   case PIPE_PRIM_LINE_LOOP: +      for (i = 1; i < nr; i ++) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, i-1, stride), +                     get_vert(vertex_buffer, i-0, stride) ); +      } +      if (nr) { +         setup_line( setup_ctx, +                     get_vert(vertex_buffer, nr-1, stride), +                     get_vert(vertex_buffer, 0, stride) ); +      } +      break; + + +   case PIPE_PRIM_TRIANGLES: +      for (i = 2; i < nr; i += 3) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i-2, stride), +                    get_vert(vertex_buffer, i-1, stride), +                    get_vert(vertex_buffer, i-0, stride)); +      } +      break; + +   case PIPE_PRIM_TRIANGLE_STRIP: +      for (i = 2; i < nr; i += 1) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i+(i&1)-2, stride), +                    get_vert(vertex_buffer, i-(i&1)-1, 
stride), +                    get_vert(vertex_buffer, i-0, stride)); +      } +      break; + +   case PIPE_PRIM_TRIANGLE_FAN: +   case PIPE_PRIM_POLYGON: +      for (i = 2; i < nr; i += 1) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, 0, stride), +                    get_vert(vertex_buffer, i-1, stride), +                    get_vert(vertex_buffer, i-0, stride)); +      } +      break; +   case PIPE_PRIM_QUADS: +      for (i = 3; i < nr; i += 4) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i-3, stride), +                    get_vert(vertex_buffer, i-2, stride), +                    get_vert(vertex_buffer, i-0, stride)); + +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i-2, stride), +                    get_vert(vertex_buffer, i-1, stride), +                    get_vert(vertex_buffer, i-0, stride)); +      } +      break; +   case PIPE_PRIM_QUAD_STRIP: +      for (i = 3; i < nr; i += 2) { +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i-3, stride), +                    get_vert(vertex_buffer, i-2, stride), +                    get_vert(vertex_buffer, i-0, stride)); + +         setup_tri( setup_ctx, +                    get_vert(vertex_buffer, i-1, stride), +                    get_vert(vertex_buffer, i-3, stride), +                    get_vert(vertex_buffer, i-0, stride)); +      } +      break; +   default: +      assert(0); +   } +} + + + +static void +sp_vbuf_destroy(struct vbuf_render *vbr) +{ +   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); +   cvbr->softpipe->vbuf_render = NULL; +   FREE(cvbr); +} + + +/** + * Initialize the post-transform vertex buffer information for the given + * context. + */ +void +sp_init_vbuf(struct softpipe_context *sp) +{ +   assert(sp->draw); + +   sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render); + +   sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES; +   sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE; + +   sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info; +   sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices; +   sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive; +   sp->vbuf_render->base.draw = sp_vbuf_draw; +   sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays; +   sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices; +   sp->vbuf_render->base.destroy = sp_vbuf_destroy; + +   sp->vbuf_render->softpipe = sp; + +   sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base); + +   draw_set_rasterize_stage(sp->draw, sp->vbuf); + +   draw_set_render(sp->draw, &sp->vbuf_render->base); +} diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h new file mode 100644 index 0000000000..1de9cc2a89 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
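[Editor's illustration] The index arithmetic in sp_vbuf_draw() and sp_vbuf_draw_arrays() above is compact, particularly the (i & 1) parity trick for triangle strips and the two-triangle quad split. The standalone harness below (not driver code) just prints which vertex indices those loops would hand to setup_tri(), so the decomposition can be checked by eye.

#include <stdio.h>

/* Print the triangles the loops above would emit for a strip and for quads. */
static void emit_tri(unsigned a, unsigned b, unsigned c)
{
   printf("  tri %u %u %u\n", a, b, c);
}

static void decompose_tri_strip(unsigned nr)
{
   unsigned i;
   printf("triangle strip, %u verts:\n", nr);
   for (i = 2; i < nr; i++)
      emit_tri(i + (i & 1) - 2,   /* parity flip keeps consistent winding */
               i - (i & 1) - 1,
               i);
}

static void decompose_quads(unsigned nr)
{
   unsigned i;
   printf("quads, %u verts:\n", nr);
   for (i = 3; i < nr; i += 4) {
      /* each quad (i-3, i-2, i-1, i) becomes two triangles */
      emit_tri(i - 3, i - 2, i);
      emit_tri(i - 2, i - 1, i);
   }
}

int main(void)
{
   decompose_tri_strip(6);   /* expect: 0 1 2, 2 1 3, 2 3 4, 4 3 5 */
   decompose_quads(8);       /* expect: 0 1 3, 1 2 3, 4 5 7, 5 6 7 */
   return 0;
}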
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SP_VBUF_H +#define SP_VBUF_H + + +struct softpipe_context; + +extern void +sp_init_vbuf(struct softpipe_context *softpipe); + + +#endif /* SP_VBUF_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad.c b/src/gallium/drivers/softpipe/sp_quad.c new file mode 100644 index 0000000000..892ef87ee9 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad.c @@ -0,0 +1,118 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "pipe/p_shader_tokens.h" + +static void +sp_push_quad_first( +   struct softpipe_context *sp, +   struct quad_stage *quad, +   uint i ) +{ +   quad->next = sp->quad[i].first; +   sp->quad[i].first = quad; +} + +static void +sp_build_depth_stencil( +   struct softpipe_context *sp, +   uint i ) +{ +   if (sp->depth_stencil->stencil[0].enabled || +       sp->depth_stencil->stencil[1].enabled) { +      sp_push_quad_first( sp, sp->quad[i].stencil_test, i ); +   } +   else if (sp->depth_stencil->depth.enabled && +            sp->framebuffer.zsbuf) { +      sp_push_quad_first( sp, sp->quad[i].depth_test, i ); +   } +} + +void +sp_build_quad_pipeline(struct softpipe_context *sp) +{ +   uint i; + +   boolean early_depth_test = +               sp->depth_stencil->depth.enabled && +               sp->framebuffer.zsbuf && +               !sp->depth_stencil->alpha.enabled && +               !sp->fs->info.uses_kill && +               !sp->fs->info.writes_z; + +   /* build up the pipeline in reverse order... */ +   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { +      sp->quad[i].first = sp->quad[i].output; + +      if (sp->blend->colormask != 0xf) { +         sp_push_quad_first( sp, sp->quad[i].colormask, i ); +      } + +      if (sp->blend->blend_enable || +          sp->blend->logicop_enable) { +         sp_push_quad_first( sp, sp->quad[i].blend, i ); +      } + +      if (sp->depth_stencil->depth.occlusion_count) { +         sp_push_quad_first( sp, sp->quad[i].occlusion, i ); +      } + +      if (sp->rasterizer->poly_smooth || +          sp->rasterizer->line_smooth || +          sp->rasterizer->point_smooth) { +         sp_push_quad_first( sp, sp->quad[i].coverage, i ); +      } + +      if (!early_depth_test) { +         sp_build_depth_stencil( sp, i ); +      } + +      if (sp->depth_stencil->alpha.enabled) { +         sp_push_quad_first( sp, sp->quad[i].alpha_test, i ); +      } + +      /* XXX always enable shader? */ +      if (1) { +         sp_push_quad_first( sp, sp->quad[i].shade, i ); +      } + +      if (early_depth_test) { +         sp_build_depth_stencil( sp, i ); +         sp_push_quad_first( sp, sp->quad[i].earlyz, i ); +      } + +#if !USE_DRAW_STAGE_PSTIPPLE +      if (sp->rasterizer->poly_stipple_enable) { +         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i ); +      } +#endif +   } +} + diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h new file mode 100644 index 0000000000..08513cb95f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad.h @@ -0,0 +1,69 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
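[Editor's illustration] sp_build_quad_pipeline() above assembles the per-quad stage chain "in reverse order": sp_push_quad_first() pushes each stage onto the head of a singly linked list whose tail is always the output stage, so the stage pushed last is the one that runs first. A minimal standalone sketch of that idiom (names invented for the example):

#include <stdio.h>

struct toy_stage {
   const char *name;
   struct toy_stage *next;
};

/* same idea as sp_push_quad_first(): prepend to the chain */
static void push_first(struct toy_stage **first, struct toy_stage *s)
{
   s->next = *first;
   *first = s;
}

int main(void)
{
   struct toy_stage output = { "output", NULL };
   struct toy_stage blend  = { "blend",  NULL };
   struct toy_stage depth  = { "depth",  NULL };
   struct toy_stage shade  = { "shade",  NULL };
   struct toy_stage *first = &output;   /* output is always the tail */
   struct toy_stage *s;

   /* pushed back-to-front, as in the driver code */
   push_first(&first, &blend);
   push_first(&first, &depth);
   push_first(&first, &shade);

   /* walking the list gives front-to-back execution order */
   for (s = first; s; s = s->next)
      printf("%s\n", s->name);   /* shade, depth, blend, output */
   return 0;
}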
+ *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_QUAD_H +#define SP_QUAD_H + + +struct softpipe_context; +struct quad_header; + + +struct quad_stage { +   struct softpipe_context *softpipe; + +   struct quad_stage *next; + +   void (*begin)(struct quad_stage *qs); + +   /** the stage action */ +   void (*run)(struct quad_stage *qs, struct quad_header *quad); + +   void (*destroy)(struct quad_stage *qs); +}; + + +struct quad_stage *sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_earlyz_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_alpha_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ); + +void sp_build_quad_pipeline(struct softpipe_context *sp); + +void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad); + +#endif /* SP_QUAD_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c new file mode 100644 index 0000000000..85c9f037a3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c @@ -0,0 +1,108 @@ + +/** + * quad alpha test + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +static void +alpha_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   const float ref = softpipe->depth_stencil->alpha.ref_value; +   unsigned passMask = 0x0, j; +   const uint cbuf = 0; /* only output[0].alpha is tested */ +   const float *aaaa = quad->output.color[cbuf][3]; + +   switch (softpipe->depth_stencil->alpha.func) { +   case PIPE_FUNC_NEVER: +      break; +   case PIPE_FUNC_LESS: +      /* +       * If mask were an array [4] we could do this SIMD-style: +       * passMask = (quad->outputs.color[0][3] <= vec4(ref)); +       */ +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] < ref) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_EQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] == ref) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_LEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] <= ref) { +            
passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_GREATER: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] > ref) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_NOTEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] != ref) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_GEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (aaaa[j] >= ref) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_ALWAYS: +      passMask = MASK_ALL; +      break; +   default: +      assert(0); +   } + +   quad->inout.mask &= passMask; + +   if (quad->inout.mask) +      qs->next->run(qs->next, quad); +} + + +static void alpha_test_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void alpha_test_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage * +sp_quad_alpha_test_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = alpha_test_begin; +   stage->run = alpha_test_quad; +   stage->destroy = alpha_test_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c new file mode 100644 index 0000000000..fb1d430a4f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -0,0 +1,759 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
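[Editor's illustration] The quad_stage interface declared in sp_quad.h above (begin/run/destroy plus a next pointer) is what every per-fragment stage implements; the alpha-test stage just shown is a typical example. Below is a purely illustrative do-nothing stage written against that interface. It is not part of this commit, would only build inside the softpipe source tree, and the function names are invented.

#include "sp_context.h"
#include "sp_quad.h"
#include "util/u_memory.h"

static void passthrough_begin(struct quad_stage *qs)
{
   /* propagate begin() down the chain, like every other stage */
   qs->next->begin(qs->next);
}

static void passthrough_run(struct quad_stage *qs, struct quad_header *quad)
{
   /* a real stage would inspect or modify the quad here (e.g. its
    * inout.mask or output colors) before passing it along */
   qs->next->run(qs->next, quad);
}

static void passthrough_destroy(struct quad_stage *qs)
{
   FREE(qs);
}

/* hypothetical constructor, mirroring sp_quad_alpha_test_stage() above */
struct quad_stage *
sp_quad_passthrough_stage(struct softpipe_context *softpipe)
{
   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);

   stage->softpipe = softpipe;
   stage->begin = passthrough_begin;
   stage->run = passthrough_run;
   stage->destroy = passthrough_destroy;
   return stage;
}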
+ *  + **************************************************************************/ + +/** + * quad blending + * \author Brian Paul + */ + +#include "pipe/p_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_quad.h" + + +#define VEC4_COPY(DST, SRC) \ +do { \ +    DST[0] = SRC[0]; \ +    DST[1] = SRC[1]; \ +    DST[2] = SRC[2]; \ +    DST[3] = SRC[3]; \ +} while(0) + +#define VEC4_SCALAR(DST, SRC) \ +do { \ +    DST[0] = SRC; \ +    DST[1] = SRC; \ +    DST[2] = SRC; \ +    DST[3] = SRC; \ +} while(0) + +#define VEC4_ADD(R, A, B) \ +do { \ +   R[0] = A[0] + B[0]; \ +   R[1] = A[1] + B[1]; \ +   R[2] = A[2] + B[2]; \ +   R[3] = A[3] + B[3]; \ +} while (0) + +#define VEC4_SUB(R, A, B) \ +do { \ +   R[0] = A[0] - B[0]; \ +   R[1] = A[1] - B[1]; \ +   R[2] = A[2] - B[2]; \ +   R[3] = A[3] - B[3]; \ +} while (0) + +#define VEC4_MUL(R, A, B) \ +do { \ +   R[0] = A[0] * B[0]; \ +   R[1] = A[1] * B[1]; \ +   R[2] = A[2] * B[2]; \ +   R[3] = A[3] * B[3]; \ +} while (0) + +#define VEC4_MIN(R, A, B) \ +do { \ +   R[0] = (A[0] < B[0]) ? A[0] : B[0]; \ +   R[1] = (A[1] < B[1]) ? A[1] : B[1]; \ +   R[2] = (A[2] < B[2]) ? A[2] : B[2]; \ +   R[3] = (A[3] < B[3]) ? A[3] : B[3]; \ +} while (0) + +#define VEC4_MAX(R, A, B) \ +do { \ +   R[0] = (A[0] > B[0]) ? A[0] : B[0]; \ +   R[1] = (A[1] > B[1]) ? A[1] : B[1]; \ +   R[2] = (A[2] > B[2]) ? A[2] : B[2]; \ +   R[3] = (A[3] > B[3]) ? A[3] : B[3]; \ +} while (0) + + + +static void +logicop_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   uint cbuf; + +   /* loop over colorbuffer outputs */ +   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { +      float dest[4][QUAD_SIZE]; +      ubyte src[4][4], dst[4][4], res[4][4]; +      uint *src4 = (uint *) src; +      uint *dst4 = (uint *) dst; +      uint *res4 = (uint *) res; +      struct softpipe_cached_tile * +         tile = sp_get_cached_tile(softpipe, +                                   softpipe->cbuf_cache[cbuf], +                                   quad->input.x0, quad->input.y0); +      float (*quadColor)[4] = quad->output.color[cbuf]; +      uint i, j; + +      /* get/swizzle dest colors */ +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); +         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); +         for (i = 0; i < 4; i++) { +            dest[i][j] = tile->data.color[y][x][i]; +         } +      } + +      /* convert to ubyte */ +      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */ +         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */ +         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */ +         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */ +         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */ + +         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */ +         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */ +         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */ +         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */ +      } + +      switch (softpipe->blend->logicop_func) { +      case PIPE_LOGICOP_CLEAR: +         for (j = 0; j < 4; j++) +            res4[j] = 0; +         break; +      case PIPE_LOGICOP_NOR: +         for (j = 0; j < 4; j++) +            res4[j] = ~(src4[j] | dst4[j]); +         break; +      case PIPE_LOGICOP_AND_INVERTED: +         for (j = 0; j < 4; j++) 
+            res4[j] = ~src4[j] & dst4[j]; +         break; +      case PIPE_LOGICOP_COPY_INVERTED: +         for (j = 0; j < 4; j++) +            res4[j] = ~src4[j]; +         break; +      case PIPE_LOGICOP_AND_REVERSE: +         for (j = 0; j < 4; j++) +            res4[j] = src4[j] & ~dst4[j]; +         break; +      case PIPE_LOGICOP_INVERT: +         for (j = 0; j < 4; j++) +            res4[j] = ~dst4[j]; +         break; +      case PIPE_LOGICOP_XOR: +         for (j = 0; j < 4; j++) +            res4[j] = dst4[j] ^ src4[j]; +         break; +      case PIPE_LOGICOP_NAND: +         for (j = 0; j < 4; j++) +            res4[j] = ~(src4[j] & dst4[j]); +         break; +      case PIPE_LOGICOP_AND: +         for (j = 0; j < 4; j++) +            res4[j] = src4[j] & dst4[j]; +         break; +      case PIPE_LOGICOP_EQUIV: +         for (j = 0; j < 4; j++) +            res4[j] = ~(src4[j] ^ dst4[j]); +         break; +      case PIPE_LOGICOP_NOOP: +         for (j = 0; j < 4; j++) +            res4[j] = dst4[j]; +         break; +      case PIPE_LOGICOP_OR_INVERTED: +         for (j = 0; j < 4; j++) +            res4[j] = ~src4[j] | dst4[j]; +         break; +      case PIPE_LOGICOP_COPY: +         for (j = 0; j < 4; j++) +            res4[j] = src4[j]; +         break; +      case PIPE_LOGICOP_OR_REVERSE: +         for (j = 0; j < 4; j++) +            res4[j] = src4[j] | ~dst4[j]; +         break; +      case PIPE_LOGICOP_OR: +         for (j = 0; j < 4; j++) +            res4[j] = src4[j] | dst4[j]; +         break; +      case PIPE_LOGICOP_SET: +         for (j = 0; j < 4; j++) +            res4[j] = ~0; +         break; +      default: +         assert(0); +      } + +      for (j = 0; j < 4; j++) { +         quadColor[j][0] = ubyte_to_float(res[j][0]); +         quadColor[j][1] = ubyte_to_float(res[j][1]); +         quadColor[j][2] = ubyte_to_float(res[j][2]); +         quadColor[j][3] = ubyte_to_float(res[j][3]); +      } +   } + +   /* pass quad to next stage */ +   qs->next->run(qs->next, quad); +} + + + + +static void +blend_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   static const float zero[4] = { 0, 0, 0, 0 }; +   static const float one[4] = { 1, 1, 1, 1 }; + +   struct softpipe_context *softpipe = qs->softpipe; +   uint cbuf; + +   if (softpipe->blend->logicop_enable) { +      logicop_quad(qs, quad); +      return; +   } + +   /* loop over colorbuffer outputs */ +   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { +      float source[4][QUAD_SIZE], dest[4][QUAD_SIZE]; +      struct softpipe_cached_tile *tile +         = sp_get_cached_tile(softpipe, +                              softpipe->cbuf_cache[cbuf], +                              quad->input.x0, quad->input.y0); +      float (*quadColor)[4] = quad->output.color[cbuf]; +      uint i, j; + +      /* get/swizzle dest colors */ +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); +         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); +         for (i = 0; i < 4; i++) { +            dest[i][j] = tile->data.color[y][x][i]; +         } +      } + +      /* +       * Compute src/first term RGB +       */ +      switch (softpipe->blend->rgb_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         VEC4_COPY(source[0], quadColor[0]); /* R */ +         VEC4_COPY(source[1], quadColor[1]); /* G */ +         VEC4_COPY(source[2], quadColor[2]); /* B */ +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         VEC4_MUL(source[0], 
quadColor[0], quadColor[0]); /* R */ +         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */ +         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */ +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         { +            const float *alpha = quadColor[3]; +            VEC4_MUL(source[0], quadColor[0], alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], alpha); /* G */ +            VEC4_MUL(source[2], quadColor[2], alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */ +         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */ +         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */ +         break; +      case PIPE_BLENDFACTOR_DST_ALPHA: +         { +            const float *alpha = dest[3]; +            VEC4_MUL(source[0], quadColor[0], alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], alpha); /* G */ +            VEC4_MUL(source[2], quadColor[2], alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +         { +            const float *alpha = quadColor[3]; +            float diff[4], temp[4]; +            VEC4_SUB(diff, one, dest[3]); +            VEC4_MIN(temp, alpha, diff); +            VEC4_MUL(source[0], quadColor[0], temp); /* R */ +            VEC4_MUL(source[1], quadColor[1], temp); /* G */ +            VEC4_MUL(source[2], quadColor[2], temp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         { +            float comp[4]; +            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */ +            VEC4_MUL(source[0], quadColor[0], comp); /* R */ +            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */ +            VEC4_MUL(source[1], quadColor[1], comp); /* G */ +            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */ +            VEC4_MUL(source[2], quadColor[2], comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         { +            float alpha[4]; +            VEC4_SCALAR(alpha, softpipe->blend_color.color[3]); +            VEC4_MUL(source[0], quadColor[0], alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], alpha); /* G */ +            VEC4_MUL(source[2], quadColor[2], alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_SRC1_COLOR: +         assert(0); /* to do */ +         break; +      case PIPE_BLENDFACTOR_SRC1_ALPHA: +         assert(0); /* to do */ +         break; +      case PIPE_BLENDFACTOR_ZERO: +         VEC4_COPY(source[0], zero); /* R */ +         VEC4_COPY(source[1], zero); /* G */ +         VEC4_COPY(source[2], zero); /* B */ +         break; +      case PIPE_BLENDFACTOR_INV_SRC_COLOR: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */ +            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */ +            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */ +            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */ +            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */ +            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         { +            float inv_alpha[4]; +            VEC4_SUB(inv_alpha, one, quadColor[3]); +            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ +            
VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_ALPHA: +         { +            float inv_alpha[4]; +            VEC4_SUB(inv_alpha, one, dest[3]); +            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ +            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_COLOR: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, dest[0]); /* R */ +            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */ +            VEC4_SUB(inv_comp, one, dest[1]); /* G */ +            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */ +            VEC4_SUB(inv_comp, one, dest[2]); /* B */ +            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_COLOR: +         { +            float inv_comp[4]; +            /* R */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]); +            VEC4_MUL(source[0], quadColor[0], inv_comp); +            /* G */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]); +            VEC4_MUL(source[1], quadColor[1], inv_comp); +            /* B */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]); +            VEC4_MUL(source[2], quadColor[2], inv_comp); +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +         { +            float inv_alpha[4]; +            VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]); +            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ +            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ +            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +         assert(0); /* to do */ +         break; +      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +         assert(0); /* to do */ +         break; +      default: +         assert(0); +      } + +      /* +       * Compute src/first term A +       */ +      switch (softpipe->blend->alpha_src_factor) { +      case PIPE_BLENDFACTOR_ONE: +         VEC4_COPY(source[3], quadColor[3]); /* A */ +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         { +            const float *alpha = quadColor[3]; +            VEC4_MUL(source[3], quadColor[3], alpha); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_DST_ALPHA: +         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */ +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +         /* multiply alpha by 1.0 */ +         VEC4_COPY(source[3], quadColor[3]); /* A */ +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         { +            float comp[4]; +            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ +            VEC4_MUL(source[3], quadColor[3], comp); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_ZERO: +         VEC4_COPY(source[3], zero); /* A */ +         break; +      case PIPE_BLENDFACTOR_INV_SRC_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         { +            float inv_alpha[4]; +            
VEC4_SUB(inv_alpha, one, quadColor[3]); +            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_DST_ALPHA: +         { +            float inv_alpha[4]; +            VEC4_SUB(inv_alpha, one, dest[3]); +            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +         { +            float inv_comp[4]; +            /* A */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); +            VEC4_MUL(source[3], quadColor[3], inv_comp); +         } +         break; +      default: +         assert(0); +      } + + +      /* +       * Compute dest/second term RGB +       */ +      switch (softpipe->blend->rgb_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         /* dest = dest * 1   NO-OP, leave dest as-is */ +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */ +         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */ +         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */ +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */ +         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */ +         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */ +         break; +      case PIPE_BLENDFACTOR_DST_ALPHA: +         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */ +         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */ +         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */ +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */ +         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */ +         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */ +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +         assert(0); /* illegal */ +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         { +            float comp[4]; +            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */ +            VEC4_MUL(dest[0], dest[0], comp); /* R */ +            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */ +            VEC4_MUL(dest[1], dest[1], comp); /* G */ +            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */ +            VEC4_MUL(dest[2], dest[2], comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         { +            float comp[4]; +            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ +            VEC4_MUL(dest[0], dest[0], comp); /* R */ +            VEC4_MUL(dest[1], dest[1], comp); /* G */ +            VEC4_MUL(dest[2], dest[2], comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_ZERO: +         VEC4_COPY(dest[0], zero); /* R */ +         VEC4_COPY(dest[1], zero); /* G */ +         VEC4_COPY(dest[2], zero); /* B */ +         break; +      case PIPE_BLENDFACTOR_SRC1_COLOR: +      case PIPE_BLENDFACTOR_SRC1_ALPHA: +         /* XXX what are these? 
*/ +         assert(0); +         break; +      case PIPE_BLENDFACTOR_INV_SRC_COLOR: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */ +            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */ +            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */ +            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */ +            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */ +            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         { +            float one_minus_alpha[QUAD_SIZE]; +            VEC4_SUB(one_minus_alpha, one, quadColor[3]); +            VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */ +            VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */ +            VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_ALPHA: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, dest[3]); /* A */ +            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */ +            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */ +            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_COLOR: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, dest[0]); /* R */ +            VEC4_MUL(dest[0], dest[0], inv_comp); /* R */ +            VEC4_SUB(inv_comp, one, dest[1]); /* G */ +            VEC4_MUL(dest[1], dest[1], inv_comp); /* G */ +            VEC4_SUB(inv_comp, one, dest[2]); /* B */ +            VEC4_MUL(dest[2], dest[2], inv_comp); /* B */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_COLOR: +         { +            float inv_comp[4]; +            /* R */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]); +            VEC4_MUL(dest[0], dest[0], inv_comp); +            /* G */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]); +            VEC4_MUL(dest[1], dest[1], inv_comp); +            /* B */ +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]); +            VEC4_MUL(dest[2], dest[2], inv_comp); +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +         { +            float inv_comp[4]; +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); +            VEC4_MUL(dest[0], dest[0], inv_comp); +            VEC4_MUL(dest[1], dest[1], inv_comp); +            VEC4_MUL(dest[2], dest[2], inv_comp); +         } +         break; +      case PIPE_BLENDFACTOR_INV_SRC1_COLOR: +      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: +         /* XXX what are these? 
*/ +         assert(0); +         break; +      default: +         assert(0); +      } + +      /* +       * Compute dest/second term A +       */ +      switch (softpipe->blend->alpha_dst_factor) { +      case PIPE_BLENDFACTOR_ONE: +         /* dest = dest * 1   NO-OP, leave dest as-is */ +         break; +      case PIPE_BLENDFACTOR_SRC_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_SRC_ALPHA: +         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */ +         break; +      case PIPE_BLENDFACTOR_DST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_DST_ALPHA: +         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */ +         break; +      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: +         assert(0); /* illegal */ +         break; +      case PIPE_BLENDFACTOR_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_CONST_ALPHA: +         { +            float comp[4]; +            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ +            VEC4_MUL(dest[3], dest[3], comp); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_ZERO: +         VEC4_COPY(dest[3], zero); /* A */ +         break; +      case PIPE_BLENDFACTOR_INV_SRC_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_SRC_ALPHA: +         { +            float one_minus_alpha[QUAD_SIZE]; +            VEC4_SUB(one_minus_alpha, one, quadColor[3]); +            VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_DST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_DST_ALPHA: +         { +            float inv_comp[4]; +            VEC4_SUB(inv_comp, one, dest[3]); /* A */ +            VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */ +         } +         break; +      case PIPE_BLENDFACTOR_INV_CONST_COLOR: +         /* fall-through */ +      case PIPE_BLENDFACTOR_INV_CONST_ALPHA: +         { +            float inv_comp[4]; +            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); +            VEC4_MUL(dest[3], dest[3], inv_comp); +         } +         break; +      default: +         assert(0); +      } + +      /* +       * Combine RGB terms +       */ +      switch (softpipe->blend->rgb_func) { +      case PIPE_BLEND_ADD: +         VEC4_ADD(quadColor[0], source[0], dest[0]); /* R */ +         VEC4_ADD(quadColor[1], source[1], dest[1]); /* G */ +         VEC4_ADD(quadColor[2], source[2], dest[2]); /* B */ +         break; +      case PIPE_BLEND_SUBTRACT: +         VEC4_SUB(quadColor[0], source[0], dest[0]); /* R */ +         VEC4_SUB(quadColor[1], source[1], dest[1]); /* G */ +         VEC4_SUB(quadColor[2], source[2], dest[2]); /* B */ +         break; +      case PIPE_BLEND_REVERSE_SUBTRACT: +         VEC4_SUB(quadColor[0], dest[0], source[0]); /* R */ +         VEC4_SUB(quadColor[1], dest[1], source[1]); /* G */ +         VEC4_SUB(quadColor[2], dest[2], source[2]); /* B */ +         break; +      case PIPE_BLEND_MIN: +         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */ +         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */ +         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */ +         break; +      case PIPE_BLEND_MAX: +         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */ +         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */ +         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */ +         break; +      default: +         assert(0); +      } + +      /* +       * Combine A terms +       */ +      
switch (softpipe->blend->alpha_func) { +      case PIPE_BLEND_ADD: +         VEC4_ADD(quadColor[3], source[3], dest[3]); /* A */ +         break; +      case PIPE_BLEND_SUBTRACT: +         VEC4_SUB(quadColor[3], source[3], dest[3]); /* A */ +         break; +      case PIPE_BLEND_REVERSE_SUBTRACT: +         VEC4_SUB(quadColor[3], dest[3], source[3]); /* A */ +         break; +      case PIPE_BLEND_MIN: +         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */ +         break; +      case PIPE_BLEND_MAX: +         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */ +         break; +      default: +         assert(0); +      } + +   } /* cbuf loop */ + +   /* pass blended quad to next stage */ +   qs->next->run(qs->next, quad); +} + + +static void blend_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void blend_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = blend_begin; +   stage->run = blend_quad; +   stage->destroy = blend_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c new file mode 100644 index 0000000000..d7d6a6974d --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c @@ -0,0 +1,74 @@ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" + + +/** + * Loop over colorbuffers, passing quad to next stage each time. + */ +static void +cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE]; +   unsigned i; + +   assert(sizeof(quad->outputs.color) == sizeof(tmp)); +   assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS); + +   /* make copy of original colors since they can get modified +    * by blending and masking. +    * XXX we won't have to do this if the fragment program actually emits +    * N separate colors and we're drawing to N color buffers (MRT). +    * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is +    * in effect, we need to save/restore colors like this. +    */ +   memcpy(tmp, quad->outputs.color, sizeof(tmp)); + +   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) { +      /* set current cbuffer */ +#if 0 /* obsolete & going away */ +      softpipe->current_cbuf = i; +#endif + +      /* pass blended quad to next stage */ +      qs->next->run(qs->next, quad); + +      /* restore quad's colors for next buffer */ +      memcpy(quad->outputs.color, tmp, sizeof(tmp)); +   } +} + + +static void cbuf_loop_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void cbuf_loop_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +/** + * Create the colorbuffer loop stage. + * This is used to implement multiple render targets and GL_FRONT_AND_BACK + * rendering. 
+ */ +struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = cbuf_loop_begin; +   stage->run = cbuf_loop_quad; +   stage->destroy = cbuf_loop_destroy; + +   return stage; +} + diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c new file mode 100644 index 0000000000..563c2fc739 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c @@ -0,0 +1,116 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \brief  quad colormask stage + * \author Brian Paul + */ + +#include "pipe/p_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + + +/** + * XXX colormask could be rolled into blending... 
+ */ +static void +colormask_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   uint cbuf; + +   /* loop over colorbuffer outputs */ +   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { +      float dest[4][QUAD_SIZE]; +      struct softpipe_cached_tile *tile +         = sp_get_cached_tile(softpipe, +                              softpipe->cbuf_cache[cbuf], +                              quad->input.x0, quad->input.y0); +      float (*quadColor)[4] = quad->output.color[cbuf]; +      uint i, j; + +      /* get/swizzle dest colors */ +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); +         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); +         for (i = 0; i < 4; i++) { +            dest[i][j] = tile->data.color[y][x][i]; +         } +      } + +      /* R */ +      if (!(softpipe->blend->colormask & PIPE_MASK_R)) +          COPY_4V(quadColor[0], dest[0]); + +      /* G */ +      if (!(softpipe->blend->colormask & PIPE_MASK_G)) +          COPY_4V(quadColor[1], dest[1]); + +      /* B */ +      if (!(softpipe->blend->colormask & PIPE_MASK_B)) +          COPY_4V(quadColor[2], dest[2]); + +      /* A */ +      if (!(softpipe->blend->colormask & PIPE_MASK_A)) +          COPY_4V(quadColor[3], dest[3]); +   } + +   /* pass quad to next stage */ +   qs->next->run(qs->next, quad); +} + + +static void colormask_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void colormask_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = colormask_begin; +   stage->run = colormask_quad; +   stage->destroy = colormask_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c new file mode 100644 index 0000000000..c27fd1482d --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c @@ -0,0 +1,93 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * 
+ **************************************************************************/
+
+
+/**
+ * \brief  Apply AA coverage to quad alpha values
+ * \author  Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "sp_context.h"
+#include "sp_headers.h"
+#include "sp_quad.h"
+
+
+/**
+ * Multiply quad's alpha values by the fragment coverage.
+ */
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+
+   if ((softpipe->rasterizer->poly_smooth && quad->input.prim == PRIM_TRI) ||
+       (softpipe->rasterizer->line_smooth && quad->input.prim == PRIM_LINE) ||
+       (softpipe->rasterizer->point_smooth && quad->input.prim == PRIM_POINT)) {
+      uint cbuf;
+
+      /* loop over colorbuffer outputs */
+      for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         unsigned j;
+         for (j = 0; j < QUAD_SIZE; j++) {
+            assert(quad->input.coverage[j] >= 0.0);
+            assert(quad->input.coverage[j] <= 1.0);
+            quadColor[3][j] *= quad->input.coverage[j];
+         }
+      }
+   }
+
+   qs->next->run(qs->next, quad);
+}
+
+
+static void coverage_begin(struct quad_stage *qs)
+{
+   qs->next->begin(qs->next);
+}
+
+
+static void coverage_destroy(struct quad_stage *qs)
+{
+   FREE( qs );
+}
+
+
+struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe )
+{
+   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+
+   stage->softpipe = softpipe;
+   stage->begin = coverage_begin;
+   stage->run = coverage_quad;
+   stage->destroy = coverage_destroy;
+
+   return stage;
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
new file mode 100644
index 0000000000..523bd3e080
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -0,0 +1,290 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *  + **************************************************************************/ + +/** + * \brief  Quad depth testing + */ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + +/** + * Do depth testing for a quad. + * Not static since it's used by the stencil code. + */ + +/* + * To increase efficiency, we should probably have multiple versions + * of this function that are specifically for Z16, Z32 and FP Z buffers. + * Try to effectively do that with codegen... + */ + +void +sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   struct pipe_surface *ps = softpipe->framebuffer.zsbuf; +   const enum pipe_format format = ps->format; +   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */ +   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */ +   unsigned zmask = 0; +   unsigned j; +   struct softpipe_cached_tile *tile +      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0); + +   assert(ps); /* shouldn't get here if there's no zbuffer */ + +   /* +    * Convert quad's float depth values to int depth values (qzzzz). +    * If the Z buffer stores integer values, we _have_ to do the depth +    * compares with integers (not floats).  Otherwise, the float->int->float +    * conversion of Z values (which isn't an identity function) will cause +    * Z-fighting errors. +    * +    * Also, get the zbuffer values (bzzzz) from the cached tile. +    */ +   switch (format) { +   case PIPE_FORMAT_Z16_UNORM: +      { +         float scale = 65535.0; + +         for (j = 0; j < QUAD_SIZE; j++) { +            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); +         } + +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            bzzzz[j] = tile->data.depth16[y][x]; +         } +      } +      break; +   case PIPE_FORMAT_Z32_UNORM: +      { +         double scale = (double) (uint) ~0UL; + +         for (j = 0; j < QUAD_SIZE; j++) { +            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); +         } + +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            bzzzz[j] = tile->data.depth32[y][x]; +         } +      } +      break; +   case PIPE_FORMAT_X8Z24_UNORM: +      /* fall-through */ +   case PIPE_FORMAT_S8Z24_UNORM: +      { +         float scale = (float) ((1 << 24) - 1); + +         for (j = 0; j < QUAD_SIZE; j++) { +            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); +         } + +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            bzzzz[j] = tile->data.depth32[y][x] & 0xffffff; +         } +      } +      break; +   case PIPE_FORMAT_Z24X8_UNORM: +      /* fall-through */ +   case PIPE_FORMAT_Z24S8_UNORM: +      { +         float scale = (float) ((1 << 24) - 1); + +         for (j = 0; j < QUAD_SIZE; j++) { +            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); +         } + +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 
1); +            bzzzz[j] = tile->data.depth32[y][x] >> 8; +         } +      } +      break; +   default: +      assert(0); +   } + +   switch (softpipe->depth_stencil->depth.func) { +   case PIPE_FUNC_NEVER: +      /* zmask = 0 */ +      break; +   case PIPE_FUNC_LESS: +      /* Note this is pretty much a single sse or cell instruction.   +       * Like this:  quad->mask &= (quad->outputs.depth < zzzz); +       */ +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] < bzzzz[j])  +	    zmask |= 1 << j; +      } +      break; +   case PIPE_FUNC_EQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] == bzzzz[j])  +	    zmask |= 1 << j; +      } +      break; +   case PIPE_FUNC_LEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] <= bzzzz[j])  +	    zmask |= (1 << j); +      } +      break; +   case PIPE_FUNC_GREATER: +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] > bzzzz[j])  +	    zmask |= (1 << j); +      } +      break; +   case PIPE_FUNC_NOTEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] != bzzzz[j])  +	    zmask |= (1 << j); +      } +      break; +   case PIPE_FUNC_GEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (qzzzz[j] >= bzzzz[j])  +	    zmask |= (1 << j); +      } +      break; +   case PIPE_FUNC_ALWAYS: +      zmask = MASK_ALL; +      break; +   default: +      assert(0); +   } + +   quad->inout.mask &= zmask; + +   if (softpipe->depth_stencil->depth.writemask) { +       +      /* This is also efficient with sse / spe instructions:  +       */ +      for (j = 0; j < QUAD_SIZE; j++) { +	 if (quad->inout.mask & (1 << j)) { +	    bzzzz[j] = qzzzz[j]; +	 } +      } + +      /* put updated Z values back into cached tile */ +      switch (format) { +      case PIPE_FORMAT_Z16_UNORM: +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            tile->data.depth16[y][x] = (ushort) bzzzz[j]; +         } +         break; +      case PIPE_FORMAT_X8Z24_UNORM: +         /* fall-through */ +         /* (yes, this falls through to a different case than above) */ +      case PIPE_FORMAT_Z32_UNORM: +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            tile->data.depth32[y][x] = bzzzz[j]; +         } +         break; +      case PIPE_FORMAT_S8Z24_UNORM: +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            uint s8z24 = tile->data.depth32[y][x]; +            s8z24 = (s8z24 & 0xff000000) | bzzzz[j]; +            tile->data.depth32[y][x] = s8z24; +         } +         break; +      case PIPE_FORMAT_Z24S8_UNORM: +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            uint z24s8 = tile->data.depth32[y][x]; +            z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8); +            tile->data.depth32[y][x] = z24s8; +         } +         break; +      case PIPE_FORMAT_Z24X8_UNORM: +         for (j = 0; j < QUAD_SIZE; j++) { +            int x = quad->input.x0 % TILE_SIZE + (j & 1); +            int y = quad->input.y0 % TILE_SIZE + (j >> 1); +            tile->data.depth32[y][x] = bzzzz[j] << 8; +         } +         break; +      default: +         assert(0); +      } +   } +} + + 
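As an aside for readers (not part of this commit): the format switch above quantizes each fragment's float Z to the integer precision of the bound depth buffer before comparing, which is what the comment at the top of sp_depth_test_quad means by avoiding float->int->float Z-fighting. The tiny standalone C sketch below only illustrates the Z16 case; the helper name z16_from_float and the sample depth values are invented for the example, while the scale factor matches the PIPE_FORMAT_Z16_UNORM path above.

/* Illustrative sketch only -- not from sp_quad_depth_test.c.
 * Two depths that differ as floats can quantize to the same 16-bit
 * value; once quantized, a PIPE_FUNC_LESS-style comparison correctly
 * treats them as equal instead of letting rounding pick a winner.
 */
#include <stdio.h>

static unsigned
z16_from_float(float z)                  /* hypothetical helper */
{
   return (unsigned) (z * 65535.0f);     /* same scale as the Z16 case above */
}

int
main(void)
{
   const float a = 0.50001f, b = 0.50002f;  /* distinct as floats */
   const unsigned qa = z16_from_float(a);
   const unsigned qb = z16_from_float(b);

   /* with these values both quantize to 32768, so neither passes a
    * "less than" test against the other -- no Z-fighting from rounding
    */
   printf("qa=%u qb=%u qa<qb=%d qb<qa=%d\n", qa, qb, qa < qb, qb < qa);
   return 0;
}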
+static void +depth_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   sp_depth_test_quad(qs, quad); + +   if (quad->inout.mask) +      qs->next->run(qs->next, quad); +} + + +static void depth_test_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void depth_test_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = depth_test_begin; +   stage->run = depth_test_quad; +   stage->destroy = depth_test_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c new file mode 100644 index 0000000000..6e2dde304e --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c @@ -0,0 +1,88 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * \brief  Quad early-z testing + */ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_headers.h" +#include "sp_quad.h" + + +/** + * All this stage does is compute the quad's Z values (which is normally + * done by the shading stage). + * The next stage will do the actual depth test. 
+ */ +static void +earlyz_quad( +   struct quad_stage    *qs, +   struct quad_header   *quad ) +{ +   const float fx = (float) quad->input.x0; +   const float fy = (float) quad->input.y0; +   const float dzdx = quad->posCoef->dadx[2]; +   const float dzdy = quad->posCoef->dady[2]; +   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy; + +   quad->output.depth[0] = z0; +   quad->output.depth[1] = z0 + dzdx; +   quad->output.depth[2] = z0 + dzdy; +   quad->output.depth[3] = z0 + dzdx + dzdy; + +   qs->next->run( qs->next, quad ); +} + +static void +earlyz_begin( +   struct quad_stage *qs ) +{ +   qs->next->begin( qs->next ); +} + +static void +earlyz_destroy( +   struct quad_stage *qs ) +{ +   FREE( qs ); +} + +struct quad_stage * +sp_quad_earlyz_stage( +   struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT( quad_stage ); + +   stage->softpipe = softpipe; +   stage->begin = earlyz_begin; +   stage->run = earlyz_quad; +   stage->destroy = earlyz_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c new file mode 100644 index 0000000000..5dacbbe55f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -0,0 +1,189 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2008 VMware, Inc.  All rights reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Vertices are just an array of floats, with all the attributes + * packed.  We currently assume a layout like: + * + * attr[0][0..3] - window position + * attr[1..n][0..3] - remaining attributes. + * + * Attributes are assumed to be 4 floats wide but are packed so that + * all the enabled attributes run contiguously. 
+ */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "sp_texture.h" +#include "sp_tex_sample.h" + + +struct quad_shade_stage +{ +   struct quad_stage stage;  /**< base class */ +   struct tgsi_exec_machine machine; +   struct tgsi_exec_vector *inputs, *outputs; +}; + + +/** cast wrapper */ +static INLINE struct quad_shade_stage * +quad_shade_stage(struct quad_stage *qs) +{ +   return (struct quad_shade_stage *) qs; +} + + + +/** + * Execute fragment shader for the four fragments in the quad. + */ +static void +shade_quad( +   struct quad_stage *qs, +   struct quad_header *quad ) +{ +   struct quad_shade_stage *qss = quad_shade_stage( qs ); +   struct softpipe_context *softpipe = qs->softpipe; +   struct tgsi_exec_machine *machine = &qss->machine; +   boolean z_written; +    +   /* Consts do not require 16 byte alignment. */ +   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; + +   machine->InterpCoefs = quad->coef; + +   /* run shader */ +   quad->inout.mask &= softpipe->fs->run( softpipe->fs,  +				    &qss->machine, +				    quad ); + +   /* store outputs */ +   z_written = FALSE; +   { +      const ubyte *sem_name = softpipe->fs->info.output_semantic_name; +      const ubyte *sem_index = softpipe->fs->info.output_semantic_index; +      const uint n = qss->stage.softpipe->fs->info.num_outputs; +      uint i; +      for (i = 0; i < n; i++) { +         switch (sem_name[i]) { +         case TGSI_SEMANTIC_COLOR: +            { +               uint cbuf = sem_index[i]; +               memcpy(quad->output.color[cbuf], +                      &machine->Outputs[i].xyzw[0].f[0], +                      sizeof(quad->output.color[0]) ); +            } +            break; +         case TGSI_SEMANTIC_POSITION: +            { +               uint j; +               for (j = 0; j < 4; j++) { +                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j]; +               } +               z_written = TRUE; +            } +            break; +         } +      } +   } + +   if (!z_written) { +      /* compute Z values now, as in the quad earlyz stage */ +      /* XXX we should really only do this if the earlyz stage is not used */ +      const float fx = (float) quad->input.x0; +      const float fy = (float) quad->input.y0; +      const float dzdx = quad->posCoef->dadx[2]; +      const float dzdy = quad->posCoef->dady[2]; +      const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy; + +      quad->output.depth[0] = z0; +      quad->output.depth[1] = z0 + dzdx; +      quad->output.depth[2] = z0 + dzdy; +      quad->output.depth[3] = z0 + dzdx + dzdy; +   } + +   /* shader may cull fragments */ +   if( quad->inout.mask ) { +      qs->next->run( qs->next, quad ); +   } +} + +/** + * Per-primitive (or per-begin?) 
setup + */ +static void shade_begin(struct quad_stage *qs) +{ +   struct quad_shade_stage *qss = quad_shade_stage(qs); +   struct softpipe_context *softpipe = qs->softpipe; + +   softpipe->fs->prepare( softpipe->fs,  +			  &qss->machine, +			  (struct tgsi_sampler **) +                             softpipe->tgsi.frag_samplers_list ); + +   qs->next->begin(qs->next); +} + + +static void shade_destroy(struct quad_stage *qs) +{ +   struct quad_shade_stage *qss = (struct quad_shade_stage *) qs; + +   tgsi_exec_machine_free_data(&qss->machine); +   FREE( qss->inputs ); +   FREE( qss->outputs ); +   FREE( qs ); +} + + +struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) +{ +   struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); + +   /* allocate storage for program inputs/outputs, aligned to 16 bytes */ +   qss->inputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->inputs) + 16); +   qss->outputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->outputs) + 16); +   qss->machine.Inputs = align16(qss->inputs); +   qss->machine.Outputs = align16(qss->outputs); + +   qss->stage.softpipe = softpipe; +   qss->stage.begin = shade_begin; +   qss->stage.run = shade_quad; +   qss->stage.destroy = shade_destroy; + +   tgsi_exec_machine_init( &qss->machine ); + +   return &qss->stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c new file mode 100644 index 0000000000..169bd82876 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c @@ -0,0 +1,85 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +/** + * \brief  Quad occlusion counter stage + * \author  Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" + +static unsigned count_bits( unsigned val ) +{ +   unsigned i; + +   for (i = 0; val ; val >>= 1) +      i += (val & 1); + +   return i; +} + +static void +occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; + +   softpipe->occlusion_count += count_bits(quad->inout.mask); + +   qs->next->run(qs->next, quad); +} + + +static void occlusion_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void occlusion_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = occlusion_begin; +   stage->run = occlusion_count_quad; +   stage->destroy = occlusion_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c new file mode 100644 index 0000000000..a37c8b4c39 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_output.c @@ -0,0 +1,103 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + +/** + * Last step of quad processing: write quad colors to the framebuffer, + * taking mask into account. 
+ */ +static void +output_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   /* in-tile pos: */ +   const int itx = quad->input.x0 % TILE_SIZE; +   const int ity = quad->input.y0 % TILE_SIZE; + +   struct softpipe_context *softpipe = qs->softpipe; +   uint cbuf; + +   /* loop over colorbuffer outputs */ +   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { +      struct softpipe_cached_tile *tile +         = sp_get_cached_tile(softpipe, +                              softpipe->cbuf_cache[cbuf], +                              quad->input.x0, quad->input.y0); +      float (*quadColor)[4] = quad->output.color[cbuf]; +      int i, j; + +      /* get/swizzle dest colors */ +      for (j = 0; j < QUAD_SIZE; j++) { +         if (quad->inout.mask & (1 << j)) { +            int x = itx + (j & 1); +            int y = ity + (j >> 1); +            for (i = 0; i < 4; i++) { /* loop over color chans */ +               tile->data.color[y][x][i] = quadColor[i][j]; +            } +            if (0) { +               debug_printf("sp write pixel %d,%d: %g, %g, %g\n", +                            quad->input.x0 + x, +                            quad->input.y0 + y, +                            quadColor[0][j], +                            quadColor[1][j], +                            quadColor[2][j]); +            } +         } +      } +   } +} + + +static void output_begin(struct quad_stage *qs) +{ +   assert(qs->next == NULL); +} + + +static void output_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = output_begin; +   stage->run = output_quad; +   stage->destroy = output_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c new file mode 100644 index 0000000000..7495515764 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c @@ -0,0 +1,352 @@ + +/** + * \brief Quad stencil testing + */ + + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +/** Only 8-bit stencil supported */ +#define STENCIL_MAX 0xff + + +/** + * Do the basic stencil test (compare stencil buffer values against the + * reference value. + * + * \param stencilVals  the stencil values from the stencil buffer + * \param func  the stencil func (PIPE_FUNC_x) + * \param ref  the stencil reference value + * \param valMask  the stencil value mask indicating which bits of the stencil + *                 values and ref value are to be used. 
+ * \return mask indicating which pixels passed the stencil test + */ +static unsigned +do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func, +                unsigned ref, unsigned valMask) +{ +   unsigned passMask = 0x0; +   unsigned j; + +   ref &= valMask; + +   switch (func) { +   case PIPE_FUNC_NEVER: +      /* passMask = 0x0 */ +      break; +   case PIPE_FUNC_LESS: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref < (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_EQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref == (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_LEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref <= (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_GREATER: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref > (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_NOTEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref != (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_GEQUAL: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (ref >= (stencilVals[j] & valMask)) { +            passMask |= (1 << j); +         } +      } +      break; +   case PIPE_FUNC_ALWAYS: +      passMask = MASK_ALL; +      break; +   default: +      assert(0); +   } + +   return passMask; +} + + +/** + * Apply the stencil operator to stencil values. + * + * \param stencilVals  the stencil buffer values (read and written) + * \param mask  indicates which pixels to update + * \param op  the stencil operator (PIPE_STENCIL_OP_x) + * \param ref  the stencil reference value + * \param wrtMask  writemask controlling which bits are changed in the + *                 stencil values + */ +static void +apply_stencil_op(ubyte stencilVals[QUAD_SIZE], +                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask) +{ +   unsigned j; +   ubyte newstencil[QUAD_SIZE]; + +   for (j = 0; j < QUAD_SIZE; j++) { +      newstencil[j] = stencilVals[j]; +   } + +   switch (op) { +   case PIPE_STENCIL_OP_KEEP: +      /* no-op */ +      break; +   case PIPE_STENCIL_OP_ZERO: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            newstencil[j] = 0; +         } +      } +      break; +   case PIPE_STENCIL_OP_REPLACE: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            newstencil[j] = ref; +         } +      } +      break; +   case PIPE_STENCIL_OP_INCR: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            if (stencilVals[j] < STENCIL_MAX) { +               newstencil[j] = stencilVals[j] + 1; +            } +         } +      } +      break; +   case PIPE_STENCIL_OP_DECR: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            if (stencilVals[j] > 0) { +               newstencil[j] = stencilVals[j] - 1; +            } +         } +      } +      break; +   case PIPE_STENCIL_OP_INCR_WRAP: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            newstencil[j] = stencilVals[j] + 1; +         } +      } +      break; +   case PIPE_STENCIL_OP_DECR_WRAP: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            
newstencil[j] = stencilVals[j] - 1; +         } +      } +      break; +   case PIPE_STENCIL_OP_INVERT: +      for (j = 0; j < QUAD_SIZE; j++) { +         if (mask & (1 << j)) { +            newstencil[j] = ~stencilVals[j]; +         } +      } +      break; +   default: +      assert(0); +   } + +   /* +    * update the stencil values +    */ +   if (wrtMask != STENCIL_MAX) { +      /* apply bit-wise stencil buffer writemask */ +      for (j = 0; j < QUAD_SIZE; j++) { +         stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]); +      } +   } +   else { +      for (j = 0; j < QUAD_SIZE; j++) { +         stencilVals[j] = newstencil[j]; +      } +   } +} + + +/** + * Do stencil (and depth) testing.  Stenciling depends on the outcome of + * depth testing. + */ +static void +stencil_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   struct softpipe_context *softpipe = qs->softpipe; +   struct pipe_surface *ps = softpipe->framebuffer.zsbuf; +   unsigned func, zFailOp, zPassOp, failOp; +   ubyte ref, wrtMask, valMask; +   ubyte stencilVals[QUAD_SIZE]; +   struct softpipe_cached_tile *tile +      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0); +   uint j; +   uint face = quad->input.facing; + +   if (!softpipe->depth_stencil->stencil[1].enabled) { +      /* single-sided stencil test, use front (face=0) state */ +      face = 0; +   } + +   /* choose front or back face function, operator, etc */ +   /* XXX we could do these initializations once per primitive */ +   func    = softpipe->depth_stencil->stencil[face].func; +   failOp  = softpipe->depth_stencil->stencil[face].fail_op; +   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op; +   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op; +   ref     = softpipe->depth_stencil->stencil[face].ref_value; +   wrtMask = softpipe->depth_stencil->stencil[face].writemask; +   valMask = softpipe->depth_stencil->stencil[face].valuemask; + +   assert(ps); /* shouldn't get here if there's no stencil buffer */ + +   /* get stencil values from cached tile */ +   switch (ps->format) { +   case PIPE_FORMAT_S8Z24_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         stencilVals[j] = tile->data.depth32[y][x] >> 24; +      } +      break; +   case PIPE_FORMAT_Z24S8_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         stencilVals[j] = tile->data.depth32[y][x] & 0xff; +      } +      break; +   case PIPE_FORMAT_S8_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         stencilVals[j] = tile->data.stencil8[y][x]; +      } +      break; +   default: +      assert(0); +   } + +   /* do the stencil test first */ +   { +      unsigned passMask, failMask; +      passMask = do_stencil_test(stencilVals, func, ref, valMask); +      failMask = quad->inout.mask & ~passMask; +      quad->inout.mask &= passMask; + +      if (failOp != PIPE_STENCIL_OP_KEEP) { +         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask); +      } +   } + +   if (quad->inout.mask) { +      /* now the pixels that passed the stencil test are depth tested */ +      if (softpipe->depth_stencil->depth.enabled) { +         const unsigned origMask = 
quad->inout.mask; + +         sp_depth_test_quad(qs, quad);  /* quad->mask is updated */ + +         /* update stencil buffer values according to z pass/fail result */ +         if (zFailOp != PIPE_STENCIL_OP_KEEP) { +            const unsigned failMask = origMask & ~quad->inout.mask; +            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask); +         } + +         if (zPassOp != PIPE_STENCIL_OP_KEEP) { +            const unsigned passMask = origMask & quad->inout.mask; +            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask); +         } +      } +      else { +         /* no depth test, apply Zpass operator to stencil buffer values */ +         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask); +      } + +   } + +   /* put new stencil values into cached tile */ +   switch (ps->format) { +   case PIPE_FORMAT_S8Z24_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         uint s8z24 = tile->data.depth32[y][x]; +         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff); +         tile->data.depth32[y][x] = s8z24; +      } +      break; +   case PIPE_FORMAT_Z24S8_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         uint z24s8 = tile->data.depth32[y][x]; +         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j]; +         tile->data.depth32[y][x] = z24s8; +      } +      break; +   case PIPE_FORMAT_S8_UNORM: +      for (j = 0; j < QUAD_SIZE; j++) { +         int x = quad->input.x0 % TILE_SIZE + (j & 1); +         int y = quad->input.y0 % TILE_SIZE + (j >> 1); +         tile->data.stencil8[y][x] = stencilVals[j]; +      } +      break; +   default: +      assert(0); +   } + +   if (quad->inout.mask) +      qs->next->run(qs->next, quad); +} + + +static void stencil_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void stencil_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = stencil_begin; +   stage->run = stencil_test_quad; +   stage->destroy = stencil_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c new file mode 100644 index 0000000000..ccf37f6be5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c @@ -0,0 +1,94 @@ + +/** + * quad polygon stipple stage + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +/** + * Apply polygon stipple to quads produced by triangle rasterization + */ +static void +stipple_quad(struct quad_stage *qs, struct quad_header *quad) +{ +   static const uint bit31 = 1 << 31; +   static const uint bit30 = 1 << 30; + +   if (quad->input.prim == PRIM_TRI) { +      struct softpipe_context *softpipe = qs->softpipe; +      /* need to invert Y to index into OpenGL's stipple pattern */ +      int y0, y1; +      uint stipple0, stipple1; +      if (softpipe->rasterizer->origin_lower_left) { +         y0 = softpipe->framebuffer.height - 1 - quad->input.y0; +         y1 = y0 - 1; +      } +      else { +         y0 = quad->input.y0; +         y1 = y0 + 1; +      } +      
stipple0 = softpipe->poly_stipple.stipple[y0 % 32]; +      stipple1 = softpipe->poly_stipple.stipple[y1 % 32]; + +#if 1 +      { +      const int col0 = quad->input.x0 % 32; +      if ((stipple0 & (bit31 >> col0)) == 0) +         quad->inout.mask &= ~MASK_TOP_LEFT; + +      if ((stipple0 & (bit30 >> col0)) == 0) +         quad->inout.mask &= ~MASK_TOP_RIGHT; + +      if ((stipple1 & (bit31 >> col0)) == 0) +         quad->inout.mask &= ~MASK_BOTTOM_LEFT; + +      if ((stipple1 & (bit30 >> col0)) == 0) +         quad->inout.mask &= ~MASK_BOTTOM_RIGHT; +      } +#else +      /* We'd like to use this code, but we'd need to redefine +       * MASK_TOP_LEFT to be (1 << 1) and MASK_TOP_RIGHT to be (1 << 0), +       * and similarly for the BOTTOM bits.  But that may have undesirable +       * side effects elsewhere. +       */ +      const int col0 = 30 - (quad->input.x0 % 32); +      quad->inout.mask &= (((stipple0 >> col0) & 0x3) |  +                     (((stipple1 >> col0) & 0x3) << 2)); +#endif +      if (!quad->inout.mask) +         return; +   } + +   qs->next->run(qs->next, quad); +} + + +static void stipple_begin(struct quad_stage *qs) +{ +   qs->next->begin(qs->next); +} + + +static void stipple_destroy(struct quad_stage *qs) +{ +   FREE( qs ); +} + + +struct quad_stage * +sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ) +{ +   struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + +   stage->softpipe = softpipe; +   stage->begin = stipple_begin; +   stage->run = stipple_quad; +   stage->destroy = stipple_destroy; + +   return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c new file mode 100644 index 0000000000..b0d8e01426 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -0,0 +1,107 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_query.h" + +struct softpipe_query { +   uint64_t start; +   uint64_t end; +}; + + +static struct softpipe_query *softpipe_query( struct pipe_query *p ) +{ +   return (struct softpipe_query *)p; +} + +static struct pipe_query * +softpipe_create_query(struct pipe_context *pipe,  +		      unsigned type) +{ +   assert(type == PIPE_QUERY_OCCLUSION_COUNTER); +   return (struct pipe_query *)CALLOC_STRUCT( softpipe_query ); +} + + +static void +softpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) +{ +   FREE(q); +} + + +static void +softpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q) +{ +   struct softpipe_context *softpipe = softpipe_context( pipe ); +   struct softpipe_query *sq = softpipe_query(q); +    +   sq->start = softpipe->occlusion_count; +} + + +static void +softpipe_end_query(struct pipe_context *pipe, struct pipe_query *q) +{ +   struct softpipe_context *softpipe = softpipe_context( pipe ); +   struct softpipe_query *sq = softpipe_query(q); + +   sq->end = softpipe->occlusion_count; +} + + +static boolean +softpipe_get_query_result(struct pipe_context *pipe,  +			  struct pipe_query *q, +			  boolean wait, +			  uint64_t *result ) +{ +   struct softpipe_query *sq = softpipe_query(q); +   *result = sq->end - sq->start; +   return TRUE; +} + + +void softpipe_init_query_funcs(struct softpipe_context *softpipe ) +{ +   softpipe->pipe.create_query = softpipe_create_query; +   softpipe->pipe.destroy_query = softpipe_destroy_query; +   softpipe->pipe.begin_query = softpipe_begin_query; +   softpipe->pipe.end_query = softpipe_end_query; +   softpipe->pipe.get_query_result = softpipe_get_query_result; +} + + diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h new file mode 100644 index 0000000000..05060a4575 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_query.h @@ -0,0 +1,39 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Author: + *    Keith Whitwell + */ + +#ifndef SP_QUERY_H +#define SP_QUERY_H + +struct softpipe_context; +extern void softpipe_init_query_funcs(struct softpipe_context * ); + + +#endif /* SP_QUERY_H */ diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c new file mode 100644 index 0000000000..7380a6ae2b --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -0,0 +1,181 @@ +/************************************************************************** + *  + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" + +#include "sp_texture.h" +#include "sp_winsys.h" +#include "sp_screen.h" + + +static const char * +softpipe_get_vendor(struct pipe_screen *screen) +{ +   return "Tungsten Graphics, Inc."; +} + + +static const char * +softpipe_get_name(struct pipe_screen *screen) +{ +   return "softpipe"; +} + + +static int +softpipe_get_param(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: +      return PIPE_MAX_SAMPLERS; +   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: +      return PIPE_MAX_SAMPLERS; +   case PIPE_CAP_NPOT_TEXTURES: +      return 1; +   case PIPE_CAP_TWO_SIDED_STENCIL: +      return 1; +   case PIPE_CAP_GLSL: +      return 1; +   case PIPE_CAP_S3TC: +      return 0; +   case PIPE_CAP_ANISOTROPIC_FILTER: +      return 0; +   case PIPE_CAP_POINT_SPRITE: +      return 1; +   case PIPE_CAP_MAX_RENDER_TARGETS: +      return PIPE_MAX_COLOR_BUFS; +   case PIPE_CAP_OCCLUSION_QUERY: +      return 1; +   case PIPE_CAP_TEXTURE_MIRROR_CLAMP: +      return 1; +   case PIPE_CAP_TEXTURE_MIRROR_REPEAT: +      return 1; +   case PIPE_CAP_TEXTURE_SHADOW_MAP: +      return 1; +   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: +      return 12; /* max 2Kx2K */ +   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: +      return 8;  /* max 128x128x128 */ +   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: +      return 12; /* max 2Kx2K */ +   default: +      return 0; +   } +} + + +static float +softpipe_get_paramf(struct pipe_screen *screen, int param) +{ +   switch (param) { +   case PIPE_CAP_MAX_LINE_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_LINE_WIDTH_AA: +      return 255.0; /* arbitrary */ +   case PIPE_CAP_MAX_POINT_WIDTH: +      /* fall-through */ +   case PIPE_CAP_MAX_POINT_WIDTH_AA: +      return 255.0; /* arbitrary */ +   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: +      return 0.0; +   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: +      return 16.0; /* arbitrary */ +   default: +      return 0; +   } +} + + +/** + * Query format support for creating a texture, drawing surface, etc. + * \param format  the format to test + * \param type  one of PIPE_TEXTURE, PIPE_SURFACE + */ +static boolean +softpipe_is_format_supported( struct pipe_screen *screen, +                              enum pipe_format format,  +                              enum pipe_texture_target target, +                              unsigned tex_usage,  +                              unsigned geom_flags ) +{ +   switch(format) { +   case PIPE_FORMAT_DXT1_RGB: +   case PIPE_FORMAT_DXT1_RGBA: +   case PIPE_FORMAT_DXT3_RGBA: +   case PIPE_FORMAT_DXT5_RGBA: +      return FALSE; +   default: +      return TRUE; +   } +} + + +static void +softpipe_destroy_screen( struct pipe_screen *screen ) +{ +   struct pipe_winsys *winsys = screen->winsys; + +   if(winsys->destroy) +      winsys->destroy(winsys); + +   FREE(screen); +} + + + +/** + * Create a new pipe_screen object + * Note: we're not presently subclassing pipe_screen (no softpipe_screen). 
+ */ +struct pipe_screen * +softpipe_create_screen(struct pipe_winsys *winsys) +{ +   struct softpipe_screen *screen = CALLOC_STRUCT(softpipe_screen); + +   if (!screen) +      return NULL; + +   screen->base.winsys = winsys; + +   screen->base.destroy = softpipe_destroy_screen; + +   screen->base.get_name = softpipe_get_name; +   screen->base.get_vendor = softpipe_get_vendor; +   screen->base.get_param = softpipe_get_param; +   screen->base.get_paramf = softpipe_get_paramf; +   screen->base.is_format_supported = softpipe_is_format_supported; + +   softpipe_init_screen_texture_funcs(&screen->base); +   u_simple_screen_init(&screen->base); + +   return &screen->base; +} diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h new file mode 100644 index 0000000000..3d4bfd3e84 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_screen.h @@ -0,0 +1,58 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_SCREEN_H +#define SP_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_defines.h" + + + +struct softpipe_screen { +   struct pipe_screen base; + +   /* Increments whenever textures are modified.  Contexts can track +    * this. +    */ +   unsigned timestamp;           +}; + + + + +static INLINE struct softpipe_screen * +softpipe_screen( struct pipe_screen *pipe ) +{ +   return (struct softpipe_screen *)pipe; +} + + +#endif /* SP_SCREEN_H */ diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c new file mode 100644 index 0000000000..b1adb9cb7a --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -0,0 +1,1569 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief  Primitive rasterization/rendering (points, lines, triangles) + * + * \author  Keith Whitwell <keith@tungstengraphics.com> + * \author  Brian Paul + */ + +#include "sp_setup.h" + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "sp_state.h" +#include "sp_prim_setup.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vertex.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_thread.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +#define DEBUG_VERTS 0 +#define DEBUG_FRAGS 0 + +/** + * Triangle edge info + */ +struct edge { +   float dx;		/**< X(v1) - X(v0), used only during setup */ +   float dy;		/**< Y(v1) - Y(v0), used only during setup */ +   float dxdy;		/**< dx/dy */ +   float sx, sy;	/**< first sample point coord */ +   int lines;		/**< number of lines on this edge */ +}; + +#if SP_NUM_QUAD_THREADS > 1 + +/* Set to 1 if you want other threads to be instantly + * notified of pending jobs. + */ +#define INSTANT_NOTEMPTY_NOTIFY 0 + +struct thread_info +{ +   struct setup_context *setup; +   uint id; +   pipe_thread handle; +}; + +struct quad_job; + +typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job ); + +struct quad_job +{ +   struct quad_header_input input; +   struct quad_header_inout inout; +   quad_job_routine routine; +}; + +#define NUM_QUAD_JOBS 64 + +struct quad_job_que +{ +   struct quad_job jobs[NUM_QUAD_JOBS]; +   uint first; +   uint last; +   pipe_mutex que_mutex; +   pipe_condvar que_notfull_condvar; +   pipe_condvar que_notempty_condvar; +   uint jobs_added; +   uint jobs_done; +   pipe_condvar que_done_condvar; +}; + +static void +add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine ) +{ +#if INSTANT_NOTEMPTY_NOTIFY +   boolean empty; +#endif + +   /* Wait for empty slot, see if the que is empty. 
+    */ +   pipe_mutex_lock( que->que_mutex ); +   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) { +#if !INSTANT_NOTEMPTY_NOTIFY +      pipe_condvar_broadcast( que->que_notempty_condvar ); +#endif +      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex ); +   } +#if INSTANT_NOTEMPTY_NOTIFY +   empty = que->last == que->first; +#endif +   que->jobs_added++; +   pipe_mutex_unlock( que->que_mutex ); + +   /* Submit new job. +    */ +   que->jobs[que->last].input = quad->input; +   que->jobs[que->last].inout = quad->inout; +   que->jobs[que->last].routine = routine; +   que->last = (que->last + 1) % NUM_QUAD_JOBS; + +#if INSTANT_NOTEMPTY_NOTIFY +   /* If the que was empty, notify consumers there's a job to be done. +    */ +   if (empty) { +      pipe_mutex_lock( que->que_mutex ); +      pipe_condvar_broadcast( que->que_notempty_condvar ); +      pipe_mutex_unlock( que->que_mutex ); +   } +#endif +} + +#endif + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_context { +   struct softpipe_context *softpipe; + +   /* Vertices are just an array of floats making up each attribute in +    * turn.  Currently fixed at 4 floats, but should change in time. +    * Codegen will help cope with this. +    */ +   const float (*vmax)[4]; +   const float (*vmid)[4]; +   const float (*vmin)[4]; +   const float (*vprovoke)[4]; + +   struct edge ebot; +   struct edge etop; +   struct edge emaj; + +   float oneoverarea; + +   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS]; +   struct tgsi_interp_coef posCoef;  /* For Z, W */ +   struct quad_header quad; + +#if SP_NUM_QUAD_THREADS > 1 +   struct quad_job_que que; +   struct thread_info threads[SP_NUM_QUAD_THREADS]; +#endif + +   struct { +      int left[2];   /**< [0] = row0, [1] = row1 */ +      int right[2]; +      int y; +      unsigned y_flags; +      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ +   } span; + +#if DEBUG_FRAGS +   uint numFragsEmitted;  /**< per primitive */ +   uint numFragsWritten;  /**< per primitive */ +#endif + +   unsigned winding;		/* which winding to cull */ +}; + +#if SP_NUM_QUAD_THREADS > 1 + +static PIPE_THREAD_ROUTINE( quad_thread, param ) +{ +   struct thread_info *info = (struct thread_info *) param; +   struct quad_job_que *que = &info->setup->que; + +   for (;;) { +      struct quad_job job; +      boolean full; + +      /* Wait for an available job. +       */ +      pipe_mutex_lock( que->que_mutex ); +      while (que->last == que->first) +         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex ); + +      /* See if the que is full. +       */ +      full = (que->last + 1) % NUM_QUAD_JOBS == que->first; + +      /* Take a job and remove it from que. +       */ +      job = que->jobs[que->first]; +      que->first = (que->first + 1) % NUM_QUAD_JOBS; + +      /* Notify the producer if the que is not full. +       */ +      if (full) +         pipe_condvar_signal( que->que_notfull_condvar ); +      pipe_mutex_unlock( que->que_mutex ); + +      job.routine( info->setup, info->id, &job ); + +      /* Notify the producer if that's the last finished job. 
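       * (In other words, jobs_added and jobs_done act as a completion count:
       *  WAIT_FOR_COMPLETION() sleeps on que_done_condvar until the two
       *  counters match, i.e. until every submitted quad has been run.)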
+       */ +      pipe_mutex_lock( que->que_mutex ); +      que->jobs_done++; +      if (que->jobs_added == que->jobs_done) +         pipe_condvar_signal( que->que_done_condvar ); +      pipe_mutex_unlock( que->que_mutex ); +   } + +   return NULL; +} + +#define WAIT_FOR_COMPLETION(setup) \ +   do {\ +      pipe_mutex_lock( setup->que.que_mutex );\ +      if (!INSTANT_NOTEMPTY_NOTIFY)\ +         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\ +      while (setup->que.jobs_added != setup->que.jobs_done)\ +         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\ +      pipe_mutex_unlock( setup->que.que_mutex );\ +   } while (0) + +#else + +#define WAIT_FOR_COMPLETION(setup) ((void) 0) + +#endif + +/** + * Test if x is NaN or +/- infinity. + */ +static INLINE boolean +is_inf_or_nan(float x) +{ +   union fi tmp; +   tmp.f = x; +   return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31); +} + + +static boolean cull_tri( struct setup_context *setup, +		      float det ) +{ +   if (det != 0)  +   {    +      /* if (det < 0 then Z points toward camera and triangle is  +       * counter-clockwise winding. +       */ +      unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW; +       +      if ((winding & setup->winding) == 0)  +	 return FALSE; +   } + +   /* Culled: +    */ +   return TRUE; +} + + + +/** + * Clip setup->quad against the scissor/surface bounds. + */ +static INLINE void +quad_clip( struct setup_context *setup, struct quad_header *quad ) +{ +   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; +   const int minx = (int) cliprect->minx; +   const int maxx = (int) cliprect->maxx; +   const int miny = (int) cliprect->miny; +   const int maxy = (int) cliprect->maxy; + +   if (quad->input.x0 >= maxx || +       quad->input.y0 >= maxy || +       quad->input.x0 + 1 < minx || +       quad->input.y0 + 1 < miny) { +      /* totally clipped */ +      quad->inout.mask = 0x0; +      return; +   } +   if (quad->input.x0 < minx) +      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); +   if (quad->input.y0 < miny) +      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); +   if (quad->input.x0 == maxx - 1) +      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); +   if (quad->input.y0 == maxy - 1) +      quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); +} + + +/** + * Emit a quad (pass to next stage) with clipping. + */ +static INLINE void +clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread ) +{ +   quad_clip( setup, quad ); +   if (quad->inout.mask) { +      struct softpipe_context *sp = setup->softpipe; + +      sp->quad[thread].first->run( sp->quad[thread].first, quad ); +   } +} + +#if SP_NUM_QUAD_THREADS > 1 + +static void +clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job ) +{ +   struct quad_header quad; + +   quad.input = job->input; +   quad.inout = job->inout; +   quad.coef = setup->quad.coef; +   quad.posCoef = setup->quad.posCoef; +   quad.nr_attrs = setup->quad.nr_attrs; +   clip_emit_quad( setup, &quad, thread ); +} + +#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job ) + +#else + +#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 ) + +#endif + +/** + * Emit a quad (pass to next stage).  No clipping is done. 
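 * (The quad's inout.mask carries one coverage bit per pixel of the 2x2
 *  block - MASK_TOP_LEFT/RIGHT for the even row, MASK_BOTTOM_LEFT/RIGHT
 *  for the odd row - so 0xf means the quad is fully covered.)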
+ */ +static INLINE void +emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread ) +{ +   struct softpipe_context *sp = setup->softpipe; +#if DEBUG_FRAGS +   uint mask = quad->inout.mask; +#endif + +#if DEBUG_FRAGS +   if (mask & 1) setup->numFragsEmitted++; +   if (mask & 2) setup->numFragsEmitted++; +   if (mask & 4) setup->numFragsEmitted++; +   if (mask & 8) setup->numFragsEmitted++; +#endif +   sp->quad[thread].first->run( sp->quad[thread].first, quad ); +#if DEBUG_FRAGS +   mask = quad->inout.mask; +   if (mask & 1) setup->numFragsWritten++; +   if (mask & 2) setup->numFragsWritten++; +   if (mask & 4) setup->numFragsWritten++; +   if (mask & 8) setup->numFragsWritten++; +#endif +} + +#if SP_NUM_QUAD_THREADS > 1 + +static void +emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job ) +{ +   struct quad_header quad; + +   quad.input = job->input; +   quad.inout = job->inout; +   quad.coef = setup->quad.coef; +   quad.posCoef = setup->quad.posCoef; +   quad.nr_attrs = setup->quad.nr_attrs; +   emit_quad( setup, &quad, thread ); +} + +#define EMIT_QUAD(setup,x,y,mask) do {\ +      setup->quad.input.x0 = x;\ +      setup->quad.input.y0 = y;\ +      setup->quad.inout.mask = mask;\ +      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\ +   } while (0) + +#else + +#define EMIT_QUAD(setup,x,y,mask) do {\ +      setup->quad.input.x0 = x;\ +      setup->quad.input.y0 = y;\ +      setup->quad.inout.mask = mask;\ +      emit_quad( setup, &setup->quad, 0 );\ +   } while (0) + +#endif + +/** + * Given an X or Y coordinate, return the block/quad coordinate that it + * belongs to. + */ +static INLINE int block( int x ) +{ +   return x & ~1; +} + + +/** + * Render a horizontal span of quads + */ +static void flush_spans( struct setup_context *setup ) +{ +   const int xleft0 = setup->span.left[0]; +   const int xleft1 = setup->span.left[1]; +   const int xright0 = setup->span.right[0]; +   const int xright1 = setup->span.right[1]; +   int minleft, maxright; +   int x; + +   switch (setup->span.y_flags) { +   case 0x3: +      /* both odd and even lines written (both quad rows) */ +      minleft = block(MIN2(xleft0, xleft1)); +      maxright = block(MAX2(xright0, xright1)); +      for (x = minleft; x <= maxright; x += 2) { +         /* determine which of the four pixels is inside the span bounds */ +         uint mask = 0x0; +         if (x >= xleft0 && x < xright0) +            mask |= MASK_TOP_LEFT; +         if (x >= xleft1 && x < xright1) +            mask |= MASK_BOTTOM_LEFT; +         if (x+1 >= xleft0 && x+1 < xright0) +            mask |= MASK_TOP_RIGHT; +         if (x+1 >= xleft1 && x+1 < xright1) +            mask |= MASK_BOTTOM_RIGHT; +         EMIT_QUAD( setup, x, setup->span.y, mask ); +      } +      break; + +   case 0x1: +      /* only even line written (quad top row) */ +      minleft = block(xleft0); +      maxright = block(xright0); +      for (x = minleft; x <= maxright; x += 2) { +         uint mask = 0x0; +         if (x >= xleft0 && x < xright0) +            mask |= MASK_TOP_LEFT; +         if (x+1 >= xleft0 && x+1 < xright0) +            mask |= MASK_TOP_RIGHT; +         EMIT_QUAD( setup, x, setup->span.y, mask ); +      } +      break; + +   case 0x2: +      /* only odd line written (quad bottom row) */ +      minleft = block(xleft1); +      maxright = block(xright1); +      for (x = minleft; x <= maxright; x += 2) { +         uint mask = 0x0; +         if (x >= xleft1 && x < xright1) +            mask |= 
MASK_BOTTOM_LEFT; +         if (x+1 >= xleft1 && x+1 < xright1) +            mask |= MASK_BOTTOM_RIGHT; +         EMIT_QUAD( setup, x, setup->span.y, mask ); +      } +      break; + +   default: +      return; +   } + +   setup->span.y = 0; +   setup->span.y_flags = 0; +   setup->span.right[0] = 0; +   setup->span.right[1] = 0; +} + + +#if DEBUG_VERTS +static void print_vertex(const struct setup_context *setup, +                         const float (*v)[4]) +{ +   int i; +   debug_printf("   Vertex: (%p)\n", v); +   for (i = 0; i < setup->quad.nr_attrs; i++) { +      debug_printf("     %d: %f %f %f %f\n",  i, +              v[i][0], v[i][1], v[i][2], v[i][3]); +   } +} +#endif + +/** + * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise + */ +static boolean setup_sort_vertices( struct setup_context *setup, +                                    float det, +                                    const float (*v0)[4], +                                    const float (*v1)[4], +                                    const float (*v2)[4] ) +{ +   setup->vprovoke = v2; + +   /* determine bottom to top order of vertices */ +   { +      float y0 = v0[0][1]; +      float y1 = v1[0][1]; +      float y2 = v2[0][1]; +      if (y0 <= y1) { +	 if (y1 <= y2) { +	    /* y0<=y1<=y2 */ +	    setup->vmin = v0; +	    setup->vmid = v1; +	    setup->vmax = v2; +	 } +	 else if (y2 <= y0) { +	    /* y2<=y0<=y1 */ +	    setup->vmin = v2; +	    setup->vmid = v0; +	    setup->vmax = v1; +	 } +	 else { +	    /* y0<=y2<=y1 */ +	    setup->vmin = v0; +	    setup->vmid = v2; +	    setup->vmax = v1; +	 } +      } +      else { +	 if (y0 <= y2) { +	    /* y1<=y0<=y2 */ +	    setup->vmin = v1; +	    setup->vmid = v0; +	    setup->vmax = v2; +	 } +	 else if (y2 <= y1) { +	    /* y2<=y1<=y0 */ +	    setup->vmin = v2; +	    setup->vmid = v1; +	    setup->vmax = v0; +	 } +	 else { +	    /* y1<=y2<=y0 */ +	    setup->vmin = v1; +	    setup->vmid = v2; +	    setup->vmax = v0; +	 } +      } +   } + +   setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0]; +   setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1]; +   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0]; +   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1]; +   setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0]; +   setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1]; + +   /* +    * Compute triangle's area.  Use 1/area to compute partial +    * derivatives of attributes later. +    * +    * The area will be the same as prim->det, but the sign may be +    * different depending on how the vertices get sorted above. +    * +    * To determine whether the primitive is front or back facing we +    * use the prim->det value because its sign is correct. +    */ +   { +      const float area = (setup->emaj.dx * setup->ebot.dy - +			    setup->ebot.dx * setup->emaj.dy); + +      setup->oneoverarea = 1.0f / area; + +      /* +      debug_printf("%s one-over-area %f  area %f  det %f\n", +                   __FUNCTION__, setup->oneoverarea, area, det ); +      */ +      if (is_inf_or_nan(setup->oneoverarea)) +         return FALSE; +   } + +   /* We need to know if this is a front or back-facing triangle for: +    *  - the GLSL gl_FrontFacing fragment attribute (bool) +    *  - two-sided stencil test +    */ +   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW); + +   return TRUE; +} + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
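 * (That is, the interpolation plane is horizontal: dadx = dady = 0 and
 *  a0 is simply the provoking vertex's attribute, so every fragment in
 *  the primitive samples the same constant.)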
+ * The value comes from vertex[slot][i].
+ * The result will be put into setup->coef[slot].a0[i].
+ * \param slot  which attribute slot
+ * \param i  which component of the slot (0..3)
+ */
+static void const_coeff( struct setup_context *setup,
+                         struct tgsi_interp_coef *coef,
+                         uint vertSlot, uint i)
+{
+   assert(i <= 3);
+
+   coef->dadx[i] = 0;
+   coef->dady[i] = 0;
+
+   /* need provoking vertex info!
+    */
+   coef->a0[i] = setup->vprovoke[vertSlot][i];
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle.
+ */
+static void tri_linear_coeff( struct setup_context *setup,
+                              struct tgsi_interp_coef *coef,
+                              uint vertSlot, uint i)
+{
+   float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
+   float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float dadx = a * setup->oneoverarea;
+   float dady = b * setup->oneoverarea;
+
+   assert(i <= 3);
+
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (0.5, 0.5).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up losing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   coef->a0[i] = (setup->vmin[vertSlot][i] -
+                  (dadx * (setup->vmin[0][0] - 0.5f) +
+                   dady * (setup->vmin[0][1] - 0.5f)));
+
+   /*
+   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+		slot, "xyzw"[i],
+		setup->coef[slot].a0[i],
+		setup->coef[slot].dadx[i],
+		setup->coef[slot].dady[i]);
+   */
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
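 * (Illustratively: an attribute A is premultiplied by 1/w at each vertex,
 *  the product is interpolated linearly in screen space as
 *  a0 + dadx*x + dady*y, and dividing that by the correspondingly
 *  interpolated w term per fragment undoes the premultiplication,
 *  giving the perspective-correct value.)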
+ */ +static void tri_persp_coeff( struct setup_context *setup, +                             struct tgsi_interp_coef *coef, +                             uint vertSlot, uint i) +{ +   /* premultiply by 1/w  (v[0][3] is always W): +    */ +   float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3]; +   float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3]; +   float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3]; +   float botda = mida - mina; +   float majda = maxa - mina; +   float a = setup->ebot.dy * majda - botda * setup->emaj.dy; +   float b = setup->emaj.dx * botda - majda * setup->ebot.dx; +   float dadx = a * setup->oneoverarea; +   float dady = b * setup->oneoverarea; + +   /* +   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i, +          	setup->vmin[vertSlot][i], +          	setup->vmid[vertSlot][i], +       		setup->vmax[vertSlot][i] +          ); +   */ +   assert(i <= 3); + +   coef->dadx[i] = dadx; +   coef->dady[i] = dady; +   coef->a0[i] = (mina - +                  (dadx * (setup->vmin[0][0] - 0.5f) + +                   dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial, though Y has to be inverted for OpenGL. + * Z and W are copied from posCoef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. + */ +static void +setup_fragcoord_coeff(struct setup_context *setup, uint slot) +{ +   /*X*/ +   setup->coef[slot].a0[0] = 0; +   setup->coef[slot].dadx[0] = 1.0; +   setup->coef[slot].dady[0] = 0.0; +   /*Y*/ +   if (setup->softpipe->rasterizer->origin_lower_left) { +      /* y=0=bottom */ +      const int winHeight = setup->softpipe->framebuffer.height; +      setup->coef[slot].a0[1] = (float) (winHeight - 1); +      setup->coef[slot].dady[1] = -1.0; +   } +   else { +      /* y=0=top */ +      setup->coef[slot].a0[1] = 0.0; +      setup->coef[slot].dady[1] = 1.0; +   } +   setup->coef[slot].dadx[1] = 0.0; +   /*Z*/ +   setup->coef[slot].a0[2] = setup->posCoef.a0[2]; +   setup->coef[slot].dadx[2] = setup->posCoef.dadx[2]; +   setup->coef[slot].dady[2] = setup->posCoef.dady[2]; +   /*W*/ +   setup->coef[slot].a0[3] = setup->posCoef.a0[3]; +   setup->coef[slot].dadx[3] = setup->posCoef.dadx[3]; +   setup->coef[slot].dady[3] = setup->posCoef.dady[3]; +} + + + +/** + * Compute the setup->coef[] array dadx, dady, a0 values. + * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized. 
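 * (Z and W always use plain linear interpolation; every other input is
 *  dispatched on its interp_mode: constant, linear, perspective, or the
 *  special gl_FragCoord path for INTERP_POS.)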
+ */ +static void setup_tri_coefficients( struct setup_context *setup ) +{ +   struct softpipe_context *softpipe = setup->softpipe; +   const struct sp_fragment_shader *spfs = softpipe->fs; +   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); +   uint fragSlot; + +   /* z and w are done by linear interpolation: +    */ +   tri_linear_coeff(setup, &setup->posCoef, 0, 2); +   tri_linear_coeff(setup, &setup->posCoef, 0, 3); + +   /* setup interpolation for all the remaining attributes: +    */ +   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { +      const uint vertSlot = vinfo->attrib[fragSlot].src_index; +      uint j; + +      switch (vinfo->attrib[fragSlot].interp_mode) { +      case INTERP_CONSTANT: +         for (j = 0; j < NUM_CHANNELS; j++) +            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_LINEAR: +         for (j = 0; j < NUM_CHANNELS; j++) +            tri_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_PERSPECTIVE: +         for (j = 0; j < NUM_CHANNELS; j++) +            tri_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_POS: +         setup_fragcoord_coeff(setup, fragSlot); +         break; +      default: +         assert(0); +      } + +      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { +         /* FOG.y = front/back facing  XXX fix this */ +         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; +         setup->coef[fragSlot].dadx[1] = 0.0; +         setup->coef[fragSlot].dady[1] = 0.0; +      } +   } +} + + + +static void setup_tri_edges( struct setup_context *setup ) +{ +   float vmin_x = setup->vmin[0][0] + 0.5f; +   float vmid_x = setup->vmid[0][0] + 0.5f; + +   float vmin_y = setup->vmin[0][1] - 0.5f; +   float vmid_y = setup->vmid[0][1] - 0.5f; +   float vmax_y = setup->vmax[0][1] - 0.5f; + +   setup->emaj.sy = ceilf(vmin_y); +   setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy); +   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy; +   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy; + +   setup->etop.sy = ceilf(vmid_y); +   setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy); +   setup->etop.dxdy = setup->etop.dx / setup->etop.dy; +   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy; + +   setup->ebot.sy = ceilf(vmin_y); +   setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy); +   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy; +   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy; +} + + +/** + * Render the upper or lower half of a triangle. + * Scissoring/cliprect is applied here too. 
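 * (Per scanline the left/right bounds are computed directly as
 *  sx + y * dxdy rather than by accumulating floating-point adds, and
 *  spans are gathered two rows at a time so flush_spans() can emit them
 *  as 2x2 quads.)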
+ */ +static void subtriangle( struct setup_context *setup, +			 struct edge *eleft, +			 struct edge *eright, +			 unsigned lines ) +{ +   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; +   const int minx = (int) cliprect->minx; +   const int maxx = (int) cliprect->maxx; +   const int miny = (int) cliprect->miny; +   const int maxy = (int) cliprect->maxy; +   int y, start_y, finish_y; +   int sy = (int)eleft->sy; + +   assert((int)eleft->sy == (int) eright->sy); + +   /* clip top/bottom */ +   start_y = sy; +   finish_y = sy + lines; + +   if (start_y < miny) +      start_y = miny; + +   if (finish_y > maxy) +      finish_y = maxy; + +   start_y -= sy; +   finish_y -= sy; + +   /* +   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y); +   */ + +   for (y = start_y; y < finish_y; y++) { + +      /* avoid accumulating adds as floats don't have the precision to +       * accurately iterate large triangle edges that way.  luckily we +       * can just multiply these days. +       * +       * this is all drowned out by the attribute interpolation anyway. +       */ +      int left = (int)(eleft->sx + y * eleft->dxdy); +      int right = (int)(eright->sx + y * eright->dxdy); + +      /* clip left/right */ +      if (left < minx) +         left = minx; +      if (right > maxx) +         right = maxx; + +      if (left < right) { +         int _y = sy + y; +         if (block(_y) != setup->span.y) { +            flush_spans(setup); +            setup->span.y = block(_y); +         } + +         setup->span.left[_y&1] = left; +         setup->span.right[_y&1] = right; +         setup->span.y_flags |= 1<<(_y&1); +      } +   } + + +   /* save the values so that emaj can be restarted: +    */ +   eleft->sx += lines * eleft->dxdy; +   eright->sx += lines * eright->dxdy; +   eleft->sy += lines; +   eright->sy += lines; +} + + +/** + * Recalculate prim's determinant.  This is needed as we don't have + * get this information through the vbuf_render interface & we must + * calculate it here. + */ +static float +calc_det( const float (*v0)[4], +          const float (*v1)[4], +          const float (*v2)[4] ) +{ +   /* edge vectors e = v0 - v2, f = v1 - v2 */ +   const float ex = v0[0][0] - v2[0][0]; +   const float ey = v0[0][1] - v2[0][1]; +   const float fx = v1[0][0] - v2[0][0]; +   const float fy = v1[0][1] - v2[0][1]; + +   /* det = cross(e,f).z */ +   return ex * fy - ey * fx; +} + + +/** + * Do setup for triangle rasterization, then render the triangle. 
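 * (The triangle is walked as two halves: vmin..vmid against the long
 *  vmin..vmax edge, then vmid..vmax against the same long edge.  The
 *  sign of oneoverarea decides whether that long edge, emaj, is the
 *  left or the right side of both halves.)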
+ */ +void setup_tri( struct setup_context *setup, +                const float (*v0)[4], +                const float (*v1)[4], +                const float (*v2)[4] ) +{ +   float det; + +#if DEBUG_VERTS +   debug_printf("Setup triangle:\n"); +   print_vertex(setup, v0); +   print_vertex(setup, v1); +   print_vertex(setup, v2); +#endif + +   if (setup->softpipe->no_rast) +      return; +    +   det = calc_det(v0, v1, v2); +   /* +   debug_printf("%s\n", __FUNCTION__ ); +   */ + +#if DEBUG_FRAGS +   setup->numFragsEmitted = 0; +   setup->numFragsWritten = 0; +#endif + +   if (cull_tri( setup, det )) +      return; + +   if (!setup_sort_vertices( setup, det, v0, v1, v2 )) +      return; +   setup_tri_coefficients( setup ); +   setup_tri_edges( setup ); + +   setup->quad.input.prim = PRIM_TRI; + +   setup->span.y = 0; +   setup->span.y_flags = 0; +   setup->span.right[0] = 0; +   setup->span.right[1] = 0; +   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */ + +   /*   init_constant_attribs( setup ); */ + +   if (setup->oneoverarea < 0.0) { +      /* emaj on left: +       */ +      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines ); +      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines ); +   } +   else { +      /* emaj on right: +       */ +      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines ); +      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines ); +   } + +   flush_spans( setup ); + +   WAIT_FOR_COMPLETION(setup); + +#if DEBUG_FRAGS +   printf("Tri: %u frags emitted, %u written\n", +          setup->numFragsEmitted, +          setup->numFragsWritten); +#endif +} + + + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a line. + */ +static void +line_linear_coeff(struct setup_context *setup, +                  struct tgsi_interp_coef *coef, +                  uint vertSlot, uint i) +{ +   const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i]; +   const float dadx = da * setup->emaj.dx * setup->oneoverarea; +   const float dady = da * setup->emaj.dy * setup->oneoverarea; +   coef->dadx[i] = dadx; +   coef->dady[i] = dady; +   coef->a0[i] = (setup->vmin[vertSlot][i] - +                  (dadx * (setup->vmin[0][0] - 0.5f) + +                   dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a line. + */ +static void +line_persp_coeff(struct setup_context *setup, +                  struct tgsi_interp_coef *coef, +                  uint vertSlot, uint i) +{ +   /* XXX double-check/verify this arithmetic */ +   const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3]; +   const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3]; +   const float da = a1 - a0; +   const float dadx = da * setup->emaj.dx * setup->oneoverarea; +   const float dady = da * setup->emaj.dy * setup->oneoverarea; +   coef->dadx[i] = dadx; +   coef->dady[i] = dady; +   coef->a0[i] = (setup->vmin[vertSlot][i] - +                  (dadx * (setup->vmin[0][0] - 0.5f) + +                   dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Compute the setup->coef[] array dadx, dady, a0 values. + * Must be called after setup->vmin,vmax are initialized. 
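 * (The "area" computed here is really the squared screen-space length of
 *  the line, so dadx/dady amount to projecting the attribute delta onto
 *  the line's direction: the value varies linearly along the line and is
 *  constant across it.)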
+ */ +static INLINE boolean +setup_line_coefficients(struct setup_context *setup, +                        const float (*v0)[4], +                        const float (*v1)[4]) +{ +   struct softpipe_context *softpipe = setup->softpipe; +   const struct sp_fragment_shader *spfs = softpipe->fs; +   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); +   uint fragSlot; +   float area; + +   /* use setup->vmin, vmax to point to vertices */ +   setup->vprovoke = v1; +   setup->vmin = v0; +   setup->vmax = v1; + +   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0]; +   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1]; + +   /* NOTE: this is not really area but something proportional to it */ +   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy; +   if (area == 0.0f || is_inf_or_nan(area)) +      return FALSE; +   setup->oneoverarea = 1.0f / area; + +   /* z and w are done by linear interpolation: +    */ +   line_linear_coeff(setup, &setup->posCoef, 0, 2); +   line_linear_coeff(setup, &setup->posCoef, 0, 3); + +   /* setup interpolation for all the remaining attributes: +    */ +   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { +      const uint vertSlot = vinfo->attrib[fragSlot].src_index; +      uint j; + +      switch (vinfo->attrib[fragSlot].interp_mode) { +      case INTERP_CONSTANT: +         for (j = 0; j < NUM_CHANNELS; j++) +            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_LINEAR: +         for (j = 0; j < NUM_CHANNELS; j++) +            line_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_PERSPECTIVE: +         for (j = 0; j < NUM_CHANNELS; j++) +            line_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_POS: +         setup_fragcoord_coeff(setup, fragSlot); +         break; +      default: +         assert(0); +      } + +      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { +         /* FOG.y = front/back facing  XXX fix this */ +         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; +         setup->coef[fragSlot].dadx[1] = 0.0; +         setup->coef[fragSlot].dady[1] = 0.0; +      } +   } +   return TRUE; +} + + +/** + * Plot a pixel in a line segment. + */ +static INLINE void +plot(struct setup_context *setup, int x, int y) +{ +   const int iy = y & 1; +   const int ix = x & 1; +   const int quadX = x - ix; +   const int quadY = y - iy; +   const int mask = (1 << ix) << (2 * iy); + +   if (quadX != setup->quad.input.x0 || +       quadY != setup->quad.input.y0) +   { +      /* flush prev quad, start new quad */ + +      if (setup->quad.input.x0 != -1) +         CLIP_EMIT_QUAD(setup); + +      setup->quad.input.x0 = quadX; +      setup->quad.input.y0 = quadY; +      setup->quad.inout.mask = 0x0; +   } + +   setup->quad.inout.mask |= mask; +} + + +/** + * Do setup for line rasterization, then render the line. + * Single-pixel width, no stipple, etc.  We rely on the 'draw' module + * to handle stippling and wide lines. 
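 * (The pixel walk below is a standard integer Bresenham: for an X-major
 *  line the error term starts at 2*dy - dx, is advanced by 2*dy on a pure
 *  X step, and by 2*(dy - dx) whenever Y also steps; the Y-major case is
 *  symmetric.)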
+ */ +void +setup_line(struct setup_context *setup, +           const float (*v0)[4], +           const float (*v1)[4]) +{ +   int x0 = (int) v0[0][0]; +   int x1 = (int) v1[0][0]; +   int y0 = (int) v0[0][1]; +   int y1 = (int) v1[0][1]; +   int dx = x1 - x0; +   int dy = y1 - y0; +   int xstep, ystep; + +#if DEBUG_VERTS +   debug_printf("Setup line:\n"); +   print_vertex(setup, v0); +   print_vertex(setup, v1); +#endif + +   if (setup->softpipe->no_rast) +      return; + +   if (dx == 0 && dy == 0) +      return; + +   if (!setup_line_coefficients(setup, v0, v1)) +      return; + +   assert(v0[0][0] < 1.0e9); +   assert(v0[0][1] < 1.0e9); +   assert(v1[0][0] < 1.0e9); +   assert(v1[0][1] < 1.0e9); + +   if (dx < 0) { +      dx = -dx;   /* make positive */ +      xstep = -1; +   } +   else { +      xstep = 1; +   } + +   if (dy < 0) { +      dy = -dy;   /* make positive */ +      ystep = -1; +   } +   else { +      ystep = 1; +   } + +   assert(dx >= 0); +   assert(dy >= 0); + +   setup->quad.input.x0 = setup->quad.input.y0 = -1; +   setup->quad.inout.mask = 0x0; +   setup->quad.input.prim = PRIM_LINE; +   /* XXX temporary: set coverage to 1.0 so the line appears +    * if AA mode happens to be enabled. +    */ +   setup->quad.input.coverage[0] = +   setup->quad.input.coverage[1] = +   setup->quad.input.coverage[2] = +   setup->quad.input.coverage[3] = 1.0; + +   if (dx > dy) { +      /*** X-major line ***/ +      int i; +      const int errorInc = dy + dy; +      int error = errorInc - dx; +      const int errorDec = error - dx; + +      for (i = 0; i < dx; i++) { +         plot(setup, x0, y0); + +         x0 += xstep; +         if (error < 0) { +            error += errorInc; +         } +         else { +            error += errorDec; +            y0 += ystep; +         } +      } +   } +   else { +      /*** Y-major line ***/ +      int i; +      const int errorInc = dx + dx; +      int error = errorInc - dy; +      const int errorDec = error - dy; + +      for (i = 0; i < dy; i++) { +         plot(setup, x0, y0); + +         y0 += ystep; +         if (error < 0) { +            error += errorInc; +         } +         else { +            error += errorDec; +            x0 += xstep; +         } +      } +   } + +   /* draw final quad */ +   if (setup->quad.inout.mask) { +      CLIP_EMIT_QUAD(setup); +   } + +   WAIT_FOR_COMPLETION(setup); +} + + +static void +point_persp_coeff(struct setup_context *setup, +                  const float (*vert)[4], +                  struct tgsi_interp_coef *coef, +                  uint vertSlot, uint i) +{ +   assert(i <= 3); +   coef->dadx[i] = 0.0F; +   coef->dady[i] = 0.0F; +   coef->a0[i] = vert[vertSlot][i] * vert[0][3]; +} + + +/** + * Do setup for point rasterization, then render the point. + * Round or square points... + * XXX could optimize a lot for 1-pixel points. + */ +void +setup_point( struct setup_context *setup, +             const float (*v0)[4] ) +{ +   struct softpipe_context *softpipe = setup->softpipe; +   const struct sp_fragment_shader *spfs = softpipe->fs; +   const int sizeAttr = setup->softpipe->psize_slot; +   const float size +      = sizeAttr > 0 ? 
v0[sizeAttr][0] +      : setup->softpipe->rasterizer->point_size; +   const float halfSize = 0.5F * size; +   const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth; +   const float x = v0[0][0];  /* Note: data[0] is always position */ +   const float y = v0[0][1]; +   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); +   uint fragSlot; + +#if DEBUG_VERTS +   debug_printf("Setup point:\n"); +   print_vertex(setup, v0); +#endif + +   if (softpipe->no_rast) +      return; + +   /* For points, all interpolants are constant-valued. +    * However, for point sprites, we'll need to setup texcoords appropriately. +    * XXX: which coefficients are the texcoords??? +    * We may do point sprites as textured quads... +    * +    * KW: We don't know which coefficients are texcoords - ultimately +    * the choice of what interpolation mode to use for each attribute +    * should be determined by the fragment program, using +    * per-attribute declaration statements that include interpolation +    * mode as a parameter.  So either the fragment program will have +    * to be adjusted for pointsprite vs normal point behaviour, or +    * otherwise a special interpolation mode will have to be defined +    * which matches the required behaviour for point sprites.  But - +    * the latter is not a feature of normal hardware, and as such +    * probably should be ruled out on that basis. +    */ +   setup->vprovoke = v0; + +   /* setup Z, W */ +   const_coeff(setup, &setup->posCoef, 0, 2); +   const_coeff(setup, &setup->posCoef, 0, 3); + +   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { +      const uint vertSlot = vinfo->attrib[fragSlot].src_index; +      uint j; + +      switch (vinfo->attrib[fragSlot].interp_mode) { +      case INTERP_CONSTANT: +         /* fall-through */ +      case INTERP_LINEAR: +         for (j = 0; j < NUM_CHANNELS; j++) +            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_PERSPECTIVE: +         for (j = 0; j < NUM_CHANNELS; j++) +            point_persp_coeff(setup, setup->vprovoke, +                              &setup->coef[fragSlot], vertSlot, j); +         break; +      case INTERP_POS: +         setup_fragcoord_coeff(setup, fragSlot); +         break; +      default: +         assert(0); +      } + +      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { +         /* FOG.y = front/back facing  XXX fix this */ +         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; +         setup->coef[fragSlot].dadx[1] = 0.0; +         setup->coef[fragSlot].dady[1] = 0.0; +      } +   } + +   setup->quad.input.prim = PRIM_POINT; + +   if (halfSize <= 0.5 && !round) { +      /* special case for 1-pixel points */ +      const int ix = ((int) x) & 1; +      const int iy = ((int) y) & 1; +      setup->quad.input.x0 = (int) x - ix; +      setup->quad.input.y0 = (int) y - iy; +      setup->quad.inout.mask = (1 << ix) << (2 * iy); +      CLIP_EMIT_QUAD(setup); +   } +   else { +      if (round) { +         /* rounded points */ +         const int ixmin = block((int) (x - halfSize)); +         const int ixmax = block((int) (x + halfSize)); +         const int iymin = block((int) (y - halfSize)); +         const int iymax = block((int) (y + halfSize)); +         const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */ +         const float rmax = halfSize + 0.7071F; +         const float rmin2 = MAX2(0.0F, rmin * rmin); +         const float 
rmax2 = rmax * rmax; +         const float cscale = 1.0F / (rmax2 - rmin2); +         int ix, iy; + +         for (iy = iymin; iy <= iymax; iy += 2) { +            for (ix = ixmin; ix <= ixmax; ix += 2) { +               float dx, dy, dist2, cover; + +               setup->quad.inout.mask = 0x0; + +               dx = (ix + 0.5f) - x; +               dy = (iy + 0.5f) - y; +               dist2 = dx * dx + dy * dy; +               if (dist2 <= rmax2) { +                  cover = 1.0F - (dist2 - rmin2) * cscale; +                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f); +                  setup->quad.inout.mask |= MASK_TOP_LEFT; +               } + +               dx = (ix + 1.5f) - x; +               dy = (iy + 0.5f) - y; +               dist2 = dx * dx + dy * dy; +               if (dist2 <= rmax2) { +                  cover = 1.0F - (dist2 - rmin2) * cscale; +                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f); +                  setup->quad.inout.mask |= MASK_TOP_RIGHT; +               } + +               dx = (ix + 0.5f) - x; +               dy = (iy + 1.5f) - y; +               dist2 = dx * dx + dy * dy; +               if (dist2 <= rmax2) { +                  cover = 1.0F - (dist2 - rmin2) * cscale; +                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f); +                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT; +               } + +               dx = (ix + 1.5f) - x; +               dy = (iy + 1.5f) - y; +               dist2 = dx * dx + dy * dy; +               if (dist2 <= rmax2) { +                  cover = 1.0F - (dist2 - rmin2) * cscale; +                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f); +                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT; +               } + +               if (setup->quad.inout.mask) { +                  setup->quad.input.x0 = ix; +                  setup->quad.input.y0 = iy; +                  CLIP_EMIT_QUAD(setup); +               } +            } +         } +      } +      else { +         /* square points */ +         const int xmin = (int) (x + 0.75 - halfSize); +         const int ymin = (int) (y + 0.25 - halfSize); +         const int xmax = xmin + (int) size; +         const int ymax = ymin + (int) size; +         /* XXX could apply scissor to xmin,ymin,xmax,ymax now */ +         const int ixmin = block(xmin); +         const int ixmax = block(xmax - 1); +         const int iymin = block(ymin); +         const int iymax = block(ymax - 1); +         int ix, iy; + +         /* +         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax); +         */ +         for (iy = iymin; iy <= iymax; iy += 2) { +            uint rowMask = 0xf; +            if (iy < ymin) { +               /* above the top edge */ +               rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); +            } +            if (iy + 1 >= ymax) { +               /* below the bottom edge */ +               rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); +            } + +            for (ix = ixmin; ix <= ixmax; ix += 2) { +               uint mask = rowMask; + +               if (ix < xmin) { +                  /* fragment is past left edge of point, turn off left bits */ +                  mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); +               } +               if (ix + 1 >= xmax) { +                  /* past the right edge */ +                  mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); +               } + +               
setup->quad.inout.mask = mask; +               setup->quad.input.x0 = ix; +               setup->quad.input.y0 = iy; +               CLIP_EMIT_QUAD(setup); +            } +         } +      } +   } + +   WAIT_FOR_COMPLETION(setup); +} + +void setup_prepare( struct setup_context *setup ) +{ +   struct softpipe_context *sp = setup->softpipe; +   unsigned i; + +   if (sp->dirty) { +      softpipe_update_derived(sp); +   } + +   /* Mark surfaces as defined now */ +   for (i = 0; i < sp->framebuffer.nr_cbufs; i++){ +      if (sp->framebuffer.cbufs[i]) { +         sp->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; +      } +   } +   if (sp->framebuffer.zsbuf) { +      sp->framebuffer.zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; +   } + +   /* Note: nr_attrs is only used for debugging (vertex printing) */ +   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw); + +   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { +      sp->quad[i].first->begin( sp->quad[i].first ); +   } + +   if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES && +       sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL && +       sp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) { +      /* we'll do culling */ +      setup->winding = sp->rasterizer->cull_mode; +   } +   else { +      /* 'draw' will do culling */ +      setup->winding = PIPE_WINDING_NONE; +   } +} + + + +void setup_destroy_context( struct setup_context *setup ) +{ +   FREE( setup ); +} + + +/** + * Create a new primitive setup/render stage. + */ +struct setup_context *setup_create_context( struct softpipe_context *softpipe ) +{ +   struct setup_context *setup = CALLOC_STRUCT(setup_context); +#if SP_NUM_QUAD_THREADS > 1 +   uint i; +#endif + +   setup->softpipe = softpipe; + +   setup->quad.coef = setup->coef; +   setup->quad.posCoef = &setup->posCoef; + +#if SP_NUM_QUAD_THREADS > 1 +   setup->que.first = 0; +   setup->que.last = 0; +   pipe_mutex_init( setup->que.que_mutex ); +   pipe_condvar_init( setup->que.que_notfull_condvar ); +   pipe_condvar_init( setup->que.que_notempty_condvar ); +   setup->que.jobs_added = 0; +   setup->que.jobs_done = 0; +   pipe_condvar_init( setup->que.que_done_condvar ); +   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { +      setup->threads[i].setup = setup; +      setup->threads[i].id = i; +      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] ); +   } +#endif + +   return setup; +} + diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h new file mode 100644 index 0000000000..d54f334428 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_setup.h @@ -0,0 +1,53 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +#ifndef SP_SETUP_H +#define SP_SETUP_H + +struct setup_context; +struct softpipe_context; + +void  +setup_tri( struct setup_context *setup, +	   const float (*v0)[4], +	   const float (*v1)[4], +	   const float (*v2)[4] ); + +void +setup_line(struct setup_context *setup, +           const float (*v0)[4], +           const float (*v1)[4]); + +void +setup_point( struct setup_context *setup, +             const float (*v0)[4] ); + + +struct setup_context *setup_create_context( struct softpipe_context *softpipe ); +void setup_prepare( struct setup_context *setup ); +void setup_destroy_context( struct setup_context *setup ); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h new file mode 100644 index 0000000000..3eff41ffa5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -0,0 +1,206 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_STATE_H +#define SP_STATE_H + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + + +#define SP_NEW_VIEWPORT      0x1 +#define SP_NEW_RASTERIZER    0x2 +#define SP_NEW_FS            0x4 +#define SP_NEW_BLEND         0x8 +#define SP_NEW_CLIP          0x10 +#define SP_NEW_SCISSOR       0x20 +#define SP_NEW_STIPPLE       0x40 +#define SP_NEW_FRAMEBUFFER   0x80 +#define SP_NEW_DEPTH_STENCIL_ALPHA 0x100 +#define SP_NEW_CONSTANTS     0x200 +#define SP_NEW_SAMPLER       0x400 +#define SP_NEW_TEXTURE       0x800 +#define SP_NEW_VERTEX        0x1000 +#define SP_NEW_VS            0x2000 +#define SP_NEW_QUERY         0x4000 + + +struct tgsi_sampler; +struct tgsi_exec_machine; +struct vertex_info; + + +/** + * Subclass of pipe_shader_state (though it doesn't really need to be). + * + * This is starting to look an awful lot like a quad pipeline stage... + */ +struct sp_fragment_shader { +   struct pipe_shader_state shader; + +   struct tgsi_shader_info info; + +   void (*prepare)( const struct sp_fragment_shader *shader, +		    struct tgsi_exec_machine *machine, +		    struct tgsi_sampler **samplers); + +   /* Run the shader - this interface will get cleaned up in the +    * future: +    */ +   unsigned (*run)( const struct sp_fragment_shader *shader, +		    struct tgsi_exec_machine *machine, +		    struct quad_header *quad ); + + +   void (*delete)( struct sp_fragment_shader * ); +}; + + +/** Subclass of pipe_shader_state */ +struct sp_vertex_shader { +   struct pipe_shader_state shader;  /* Note: this field not actually used */ +   struct draw_vertex_shader *draw_data; +}; + + + +void * +softpipe_create_blend_state(struct pipe_context *, +                            const struct pipe_blend_state *); +void softpipe_bind_blend_state(struct pipe_context *, +                               void *); +void softpipe_delete_blend_state(struct pipe_context *, +                                 void *); + +void * +softpipe_create_sampler_state(struct pipe_context *, +                              const struct pipe_sampler_state *); +void softpipe_bind_sampler_states(struct pipe_context *, unsigned, void **); +void softpipe_delete_sampler_state(struct pipe_context *, void *); + +void * +softpipe_create_depth_stencil_state(struct pipe_context *, +                                    const struct pipe_depth_stencil_alpha_state *); +void softpipe_bind_depth_stencil_state(struct pipe_context *, void *); +void softpipe_delete_depth_stencil_state(struct pipe_context *, void *); + +void * +softpipe_create_rasterizer_state(struct pipe_context *, +                                 const struct pipe_rasterizer_state *); +void softpipe_bind_rasterizer_state(struct pipe_context *, void *); +void softpipe_delete_rasterizer_state(struct pipe_context *, void *); + +void softpipe_set_framebuffer_state( struct pipe_context *, +			     const struct pipe_framebuffer_state * ); + +void softpipe_set_blend_color( struct pipe_context *pipe, +                               const struct pipe_blend_color *blend_color ); + +void softpipe_set_clip_state( struct pipe_context *, +			     const struct pipe_clip_state * ); + +void softpipe_set_constant_buffer(struct pipe_context *, +                                  uint shader, uint index, +                                  const struct pipe_constant_buffer *buf); + +void *softpipe_create_fs_state(struct pipe_context *, +          
                     const struct pipe_shader_state *); +void softpipe_bind_fs_state(struct pipe_context *, void *); +void softpipe_delete_fs_state(struct pipe_context *, void *); +void *softpipe_create_vs_state(struct pipe_context *, +                               const struct pipe_shader_state *); +void softpipe_bind_vs_state(struct pipe_context *, void *); +void softpipe_delete_vs_state(struct pipe_context *, void *); + +void softpipe_set_polygon_stipple( struct pipe_context *, +				  const struct pipe_poly_stipple * ); + +void softpipe_set_scissor_state( struct pipe_context *, +                                 const struct pipe_scissor_state * ); + +void softpipe_set_sampler_textures( struct pipe_context *, +                                    unsigned num, +                                    struct pipe_texture ** ); + +void softpipe_set_viewport_state( struct pipe_context *, +                                  const struct pipe_viewport_state * ); + +void softpipe_set_vertex_elements(struct pipe_context *, +                                  unsigned count, +                                  const struct pipe_vertex_element *); + +void softpipe_set_vertex_buffers(struct pipe_context *, +                                 unsigned count, +                                 const struct pipe_vertex_buffer *); + + +void softpipe_update_derived( struct softpipe_context *softpipe ); + + +boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, +			     unsigned start, unsigned count); + +boolean softpipe_draw_elements(struct pipe_context *pipe, +			       struct pipe_buffer *indexBuffer, +			       unsigned indexSize, +			       unsigned mode, unsigned start, unsigned count); +boolean +softpipe_draw_range_elements(struct pipe_context *pipe, +                             struct pipe_buffer *indexBuffer, +                             unsigned indexSize, +                             unsigned min_index, +                             unsigned max_index, +                             unsigned mode, unsigned start, unsigned count); + +void +softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags); + + +void +softpipe_map_surfaces(struct softpipe_context *sp); + +void +softpipe_unmap_surfaces(struct softpipe_context *sp); + +void +softpipe_map_texture_surfaces(struct softpipe_context *sp); + +void +softpipe_unmap_texture_surfaces(struct softpipe_context *sp); + + +struct vertex_info * +softpipe_get_vertex_info(struct softpipe_context *softpipe); + +struct vertex_info * +softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe); + + +#endif diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c new file mode 100644 index 0000000000..384fe559af --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_blend.c @@ -0,0 +1,98 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_state.h" + + +void * +softpipe_create_blend_state(struct pipe_context *pipe, +                            const struct pipe_blend_state *blend) +{ +   return mem_dup(blend, sizeof(*blend)); +} + +void softpipe_bind_blend_state( struct pipe_context *pipe, +                                void *blend ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   softpipe->blend = (const struct pipe_blend_state *)blend; + +   softpipe->dirty |= SP_NEW_BLEND; +} + +void softpipe_delete_blend_state(struct pipe_context *pipe, +                                 void *blend) +{ +   FREE( blend ); +} + + +void softpipe_set_blend_color( struct pipe_context *pipe, +			     const struct pipe_blend_color *blend_color ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   softpipe->blend_color = *blend_color; + +   softpipe->dirty |= SP_NEW_BLEND; +} + + +/** XXX move someday?  Or consolidate all these simple state setters + * into one file. + */ + + +void * +softpipe_create_depth_stencil_state(struct pipe_context *pipe, +				    const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ +   return mem_dup(depth_stencil, sizeof(*depth_stencil)); +} + +void +softpipe_bind_depth_stencil_state(struct pipe_context *pipe, +                                  void *depth_stencil) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil; + +   softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA; +} + +void +softpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) +{ +   FREE( depth ); +} diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c new file mode 100644 index 0000000000..4946c776e3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_clip.c @@ -0,0 +1,79 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ +#include "sp_context.h" +#include "sp_state.h" +#include "draw/draw_context.h" + + +void softpipe_set_clip_state( struct pipe_context *pipe, +			     const struct pipe_clip_state *clip ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   /* pass the clip state to the draw module */ +   draw_set_clip_state(softpipe->draw, clip); +} + + +void softpipe_set_viewport_state( struct pipe_context *pipe, +                                  const struct pipe_viewport_state *viewport ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   /* pass the viewport info to the draw module */ +   draw_set_viewport_state(softpipe->draw, viewport); + +   softpipe->viewport = *viewport; /* struct copy */ +   softpipe->dirty |= SP_NEW_VIEWPORT; +} + + +void softpipe_set_scissor_state( struct pipe_context *pipe, +                                 const struct pipe_scissor_state *scissor ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   draw_flush(softpipe->draw); + +   softpipe->scissor = *scissor; /* struct copy */ +   softpipe->dirty |= SP_NEW_SCISSOR; +} + + +void softpipe_set_polygon_stipple( struct pipe_context *pipe, +                                   const struct pipe_poly_stipple *stipple ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   draw_flush(softpipe->draw); + +   softpipe->poly_stipple = *stipple; /* struct copy */ +   softpipe->dirty |= SP_NEW_STIPPLE; +} diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c new file mode 100644 index 0000000000..6b6a4c3ff3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -0,0 +1,210 @@ +/************************************************************************** + *  + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
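[Editorial sketch, not part of the original commit] softpipe_set_viewport_state() in sp_state_clip.c above only copies the struct, forwards it to the draw module and flags SP_NEW_VIEWPORT; the actual NDC-to-window mapping lives in the scale/translate pairs supplied by the caller. Below is one common way a state tracker might fill pipe_viewport_state for a w x h window with a [0,1] depth range (assumptions: scale[4]/translate[4] layout, no y flip).

#include "pipe/p_state.h"
#include "sp_state.h"

static void example_viewport(struct pipe_context *pipe, float w, float h)
{
   struct pipe_viewport_state vp;

   vp.scale[0] = 0.5f * w;   vp.translate[0] = 0.5f * w;  /* x: [-1,1] -> [0,w] */
   vp.scale[1] = 0.5f * h;   vp.translate[1] = 0.5f * h;  /* y: [-1,1] -> [0,h] */
   vp.scale[2] = 0.5f;       vp.translate[2] = 0.5f;      /* z: [-1,1] -> [0,1] */
   vp.scale[3] = 1.0f;       vp.translate[3] = 0.0f;      /* w: pass-through */

   softpipe_set_viewport_state(pipe, &vp);  /* struct copy + SP_NEW_VIEWPORT */
}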
+ *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_private.h" +#include "sp_context.h" +#include "sp_state.h" + + +/** + * Mark the current vertex layout as "invalid". + * We'll validate the vertex layout later, when we start to actually + * render a point or line or tri. + */ +static void +invalidate_vertex_layout(struct softpipe_context *softpipe) +{ +   softpipe->vertex_info.num_attribs =  0; +} + + +/** + * The vertex info describes how to convert the post-transformed vertices + * (simple float[][4]) used by the 'draw' module into vertices for + * rasterization. + * + * This function validates the vertex layout and returns a pointer to a + * vertex_info object. + */ +struct vertex_info * +softpipe_get_vertex_info(struct softpipe_context *softpipe) +{ +   struct vertex_info *vinfo = &softpipe->vertex_info; + +   if (vinfo->num_attribs == 0) { +      /* compute vertex layout now */ +      const struct sp_fragment_shader *spfs = softpipe->fs; +      const enum interp_mode colorInterp +         = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR; +      uint i; + +      if (softpipe->vbuf) { +         /* if using the post-transform vertex buffer, tell draw_vbuf to +          * simply emit the whole post-xform vertex as-is: +          */ +         struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf; +         const uint num = draw_num_vs_outputs(softpipe->draw); +         uint i; + +         /* No longer any need to try and emit draw vertex_header info. +          */ +         vinfo_vbuf->num_attribs = 0; +         for (i = 0; i < num; i++) { +            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i); +         } +         draw_compute_vertex_size(vinfo_vbuf); +      } + +      /* +       * Loop over fragment shader inputs, searching for the matching output +       * from the vertex shader. 
+       */ +      vinfo->num_attribs = 0; +      for (i = 0; i < spfs->info.num_inputs; i++) { +         int src; +         switch (spfs->info.input_semantic_name[i]) { +         case TGSI_SEMANTIC_POSITION: +            src = draw_find_vs_output(softpipe->draw, +                                      TGSI_SEMANTIC_POSITION, 0); +            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src); +            break; + +         case TGSI_SEMANTIC_COLOR: +            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_COLOR,  +                                 spfs->info.input_semantic_index[i]); +            draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); +            break; + +         case TGSI_SEMANTIC_FOG: +            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0); +            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +            break; + +         case TGSI_SEMANTIC_GENERIC: +            /* this includes texcoords and varying vars */ +            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC, +                                      spfs->info.input_semantic_index[i]); +            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); +            break; + +         default: +            assert(0); +         } +      } + +      softpipe->psize_slot = draw_find_vs_output(softpipe->draw, +                                                 TGSI_SEMANTIC_PSIZE, 0); +      if (softpipe->psize_slot > 0) { +         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, +                               softpipe->psize_slot); +      } + +      draw_compute_vertex_size(vinfo); +   } + +   return vinfo; +} + + +/** + * Called from vbuf module. + * + * Note that there's actually two different vertex layouts in softpipe. + * + * The normal one is computed in softpipe_get_vertex_info() above and is + * used by the point/line/tri "setup" code. + * + * The other one (this one) is only used by the vbuf module (which is + * not normally used by default but used in testing).  For the vbuf module, + * we basically want to pass-through the draw module's vertex layout as-is. + * When the softpipe vbuf code begins drawing, the normal vertex layout + * will come into play again. + */ +struct vertex_info * +softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe) +{ +   (void) softpipe_get_vertex_info(softpipe); +   return &softpipe->vertex_info_vbuf; +} + + +/** + * Recompute cliprect from scissor bounds, scissor enable and surface size. + */ +static void +compute_cliprect(struct softpipe_context *sp) +{ +   uint surfWidth = sp->framebuffer.width; +   uint surfHeight = sp->framebuffer.height; + +   if (sp->rasterizer->scissor) { +      /* clip to scissor rect */ +      sp->cliprect.minx = MAX2(sp->scissor.minx, 0); +      sp->cliprect.miny = MAX2(sp->scissor.miny, 0); +      sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); +      sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); +   } +   else { +      /* clip to surface bounds */ +      sp->cliprect.minx = 0; +      sp->cliprect.miny = 0; +      sp->cliprect.maxx = surfWidth; +      sp->cliprect.maxy = surfHeight; +   } +} + + +/* Hopefully this will remain quite simple, otherwise need to pull in + * something like the state tracker mechanism. 
+ */ +void softpipe_update_derived( struct softpipe_context *softpipe ) +{ +   if (softpipe->dirty & (SP_NEW_RASTERIZER | +                          SP_NEW_FS | +                          SP_NEW_VS)) +      invalidate_vertex_layout( softpipe ); + +   if (softpipe->dirty & (SP_NEW_SCISSOR | +                          SP_NEW_DEPTH_STENCIL_ALPHA | +                          SP_NEW_FRAMEBUFFER)) +      compute_cliprect(softpipe); + +   if (softpipe->dirty & (SP_NEW_BLEND | +                          SP_NEW_DEPTH_STENCIL_ALPHA | +                          SP_NEW_FRAMEBUFFER | +                          SP_NEW_RASTERIZER | +                          SP_NEW_FS |  +			  SP_NEW_QUERY)) +      sp_build_quad_pipeline(softpipe); + +   softpipe->dirty = 0; +} diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c new file mode 100644 index 0000000000..4d01a9dbe1 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_fs.c @@ -0,0 +1,160 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
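[Editorial sketch, not part of the original commit] softpipe_update_derived() above is the lazy half of a classic dirty-flag scheme: the state setters only OR an SP_NEW_* bit into softpipe->dirty, and the derived data (vertex layout, cliprect, quad pipeline) is rebuilt at most once, just before rasterization, for whichever group of bits is set. A standalone miniature of the pattern; the ex_* names and flag groupings below are invented for illustration and are not part of the driver.

#include <stdio.h>

#define EX_NEW_BLEND        (1 << 0)
#define EX_NEW_SCISSOR      (1 << 1)
#define EX_NEW_FRAMEBUFFER  (1 << 2)

struct ex_context {
   unsigned dirty;
};

/* setters just record what changed */
static void ex_set_scissor(struct ex_context *ctx)
{
   /* a real setter would also copy the new scissor state here */
   ctx->dirty |= EX_NEW_SCISSOR;
}

/* called once before rasterizing, like softpipe_update_derived() */
static void ex_update_derived(struct ex_context *ctx)
{
   if (ctx->dirty & (EX_NEW_SCISSOR | EX_NEW_FRAMEBUFFER))
      printf("recompute cliprect\n");        /* cf. compute_cliprect() */
   if (ctx->dirty & (EX_NEW_BLEND | EX_NEW_FRAMEBUFFER))
      printf("rebuild quad pipeline\n");     /* cf. sp_build_quad_pipeline() */
   ctx->dirty = 0;                           /* everything validated */
}

int main(void)
{
   struct ex_context ctx = { 0 };
   ex_set_scissor(&ctx);
   ex_update_derived(&ctx);   /* prints "recompute cliprect" */
   ex_update_derived(&ctx);   /* no-op: nothing dirty */
   return 0;
}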
+ *  + **************************************************************************/ + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + + +void * +softpipe_create_fs_state(struct pipe_context *pipe, +                         const struct pipe_shader_state *templ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   struct sp_fragment_shader *state; + +   /* debug */ +   if (softpipe->dump_fs)  +      tgsi_dump(templ->tokens, 0); + +   /* codegen */ +   state = softpipe_create_fs_llvm( softpipe, templ ); +   if (!state) { +      state = softpipe_create_fs_sse( softpipe, templ ); +      if (!state) { +         state = softpipe_create_fs_exec( softpipe, templ ); +      } +   } + +   assert(state); + +   /* get/save the summary info for this shader */ +   tgsi_scan_shader(templ->tokens, &state->info); + +   return state; +} + + +void +softpipe_bind_fs_state(struct pipe_context *pipe, void *fs) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   softpipe->fs = (struct sp_fragment_shader *) fs; + +   softpipe->dirty |= SP_NEW_FS; +} + + +void +softpipe_delete_fs_state(struct pipe_context *pipe, void *fs) +{ +   struct sp_fragment_shader *state = fs; + +   assert(fs != softpipe_context(pipe)->fs); +    +   state->delete( state ); +} + + +void * +softpipe_create_vs_state(struct pipe_context *pipe, +                         const struct pipe_shader_state *templ) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   struct sp_vertex_shader *state; + +   state = CALLOC_STRUCT(sp_vertex_shader); +   if (state == NULL ) { +      return NULL; +   } + +   state->draw_data = draw_create_vertex_shader(softpipe->draw, templ); +   if (state->draw_data == NULL) { +      FREE( state ); +      return NULL; +   } + +   return state; +} + + +void +softpipe_bind_vs_state(struct pipe_context *pipe, void *vs) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   softpipe->vs = (const struct sp_vertex_shader *)vs; + +   draw_bind_vertex_shader(softpipe->draw, +                           (softpipe->vs ? softpipe->vs->draw_data : NULL)); + +   softpipe->dirty |= SP_NEW_VS; +} + + +void +softpipe_delete_vs_state(struct pipe_context *pipe, void *vs) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   struct sp_vertex_shader *state = +      (struct sp_vertex_shader *)vs; + +   draw_delete_vertex_shader(softpipe->draw, state->draw_data); +   FREE( state ); +} + + + +void +softpipe_set_constant_buffer(struct pipe_context *pipe, +                             uint shader, uint index, +                             const struct pipe_constant_buffer *buf) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   struct pipe_screen *screen = pipe->screen; + +   assert(shader < PIPE_SHADER_TYPES); +   assert(index == 0); + +   /* note: reference counting */ +   pipe_buffer_reference(screen, +			 &softpipe->constants[shader].buffer, +			 buf ? 
buf->buffer : NULL); + +   softpipe->dirty |= SP_NEW_CONSTANTS; +} diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c new file mode 100644 index 0000000000..87b7219683 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c @@ -0,0 +1,62 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_state.h" +#include "draw/draw_context.h" + + + +void * +softpipe_create_rasterizer_state(struct pipe_context *pipe, +                                 const struct pipe_rasterizer_state *rast) +{ +   return mem_dup(rast, sizeof(*rast)); +} + +void softpipe_bind_rasterizer_state(struct pipe_context *pipe, +                                    void *setup) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   /* pass-through to draw module */ +   draw_set_rasterizer_state(softpipe->draw, setup); + +   softpipe->rasterizer = (struct pipe_rasterizer_state *)setup; + +   softpipe->dirty |= SP_NEW_RASTERIZER; +} + +void softpipe_delete_rasterizer_state(struct pipe_context *pipe, +                                      void *rasterizer) +{ +   FREE( rasterizer ); +} + + diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c new file mode 100644 index 0000000000..99a28c0d7e --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -0,0 +1,118 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors: + *  Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_inlines.h" + +#include "draw/draw_context.h" + +#include "sp_context.h" +#include "sp_context.h" +#include "sp_state.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" +#include "draw/draw_context.h" + + + +void * +softpipe_create_sampler_state(struct pipe_context *pipe, +                              const struct pipe_sampler_state *sampler) +{ +   return mem_dup(sampler, sizeof(*sampler)); +} + + +void +softpipe_bind_sampler_states(struct pipe_context *pipe, +                             unsigned num, void **sampler) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   unsigned i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == softpipe->num_samplers && +       !memcmp(softpipe->sampler, sampler, num * sizeof(void *))) +      return; + +   draw_flush(softpipe->draw); + +   for (i = 0; i < num; ++i) +      softpipe->sampler[i] = sampler[i]; +   for (i = num; i < PIPE_MAX_SAMPLERS; ++i) +      softpipe->sampler[i] = NULL; + +   softpipe->num_samplers = num; + +   softpipe->dirty |= SP_NEW_SAMPLER; +} + + +void +softpipe_set_sampler_textures(struct pipe_context *pipe, +                              unsigned num, struct pipe_texture **texture) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); +   uint i; + +   assert(num <= PIPE_MAX_SAMPLERS); + +   /* Check for no-op */ +   if (num == softpipe->num_textures && +       !memcmp(softpipe->texture, texture, num * sizeof(struct pipe_texture *))) +      return; + +   draw_flush(softpipe->draw); + +   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { +      struct pipe_texture *tex = i < num ? 
texture[i] : NULL; + +      pipe_texture_reference(&softpipe->texture[i], tex); +      sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex); +   } + +   softpipe->num_textures = num; + +   softpipe->dirty |= SP_NEW_TEXTURE; +} + + +void +softpipe_delete_sampler_state(struct pipe_context *pipe, +                              void *sampler) +{ +   FREE( sampler ); +} + + + diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c new file mode 100644 index 0000000000..1493c65884 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_surface.c @@ -0,0 +1,130 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ +#include "pipe/p_inlines.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" + +#include "draw/draw_context.h" + + +/** + * XXX this might get moved someday + * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer. + * Here, we flush the old surfaces and update the tile cache to point to the new + * surfaces. + */ +void +softpipe_set_framebuffer_state(struct pipe_context *pipe, +                               const struct pipe_framebuffer_state *fb) +{ +   struct softpipe_context *sp = softpipe_context(pipe); +   uint i; + +   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { +      /* check if changing cbuf */ +      if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) { +         /* flush old */ +         sp_flush_tile_cache(sp, sp->cbuf_cache[i]); + +         /* assign new */ +         sp->framebuffer.cbufs[i] = fb->cbufs[i]; + +         /* update cache */ +         sp_tile_cache_set_surface(sp->cbuf_cache[i], fb->cbufs[i]); +      } +   } + +   sp->framebuffer.nr_cbufs = fb->nr_cbufs; + +   /* zbuf changing? */ +   if (sp->framebuffer.zsbuf != fb->zsbuf) { +      /* flush old */ +      sp_flush_tile_cache(sp, sp->zsbuf_cache); + +      /* assign new */ +      sp->framebuffer.zsbuf = fb->zsbuf; + +      /* update cache */ +      sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf); +   } + +#if 0 +   /* XXX combined depth/stencil here */ + +   /* sbuf changing? 
*/ +   if (sp->framebuffer.sbuf != fb->sbuf) { +      /* flush old */ +      sp_flush_tile_cache(sp, sp->sbuf_cache_sep); + +      /* assign new */ +      sp->framebuffer.sbuf = fb->sbuf; + +      /* update cache */ +      if (fb->sbuf != fb->zbuf) { +         /* separate stencil buf */ +         sp->sbuf_cache = sp->sbuf_cache_sep; +         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf); +      } +      else { +         /* combined depth/stencil */ +         sp->sbuf_cache = sp->zbuf_cache; +         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf); +      } +   } +#endif + +   /* Tell draw module how deep the Z/depth buffer is */ +   { +      int depth_bits; +      double mrd; +      if (sp->framebuffer.zsbuf) { +         depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format, +                                            PIPE_FORMAT_COMP_Z); +      } +      else { +         depth_bits = 0; +      } +      if (depth_bits > 16) { +         mrd = 0.0000001; +      } +      else { +         mrd = 0.00002; +      } +      draw_set_mrd(sp->draw, mrd); +   } + +   sp->framebuffer.width = fb->width; +   sp->framebuffer.height = fb->height; + +   sp->dirty |= SP_NEW_FRAMEBUFFER; +} diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c new file mode 100644 index 0000000000..46b6991195 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_vertex.c @@ -0,0 +1,73 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_surface.h" + +#include "draw/draw_context.h" + + +void +softpipe_set_vertex_elements(struct pipe_context *pipe, +                             unsigned count, +                             const struct pipe_vertex_element *attribs) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   assert(count <= PIPE_MAX_ATTRIBS); + +   memcpy(softpipe->vertex_element, attribs, +          count * sizeof(struct pipe_vertex_element)); +   softpipe->num_vertex_elements = count; + +   softpipe->dirty |= SP_NEW_VERTEX; + +   draw_set_vertex_elements(softpipe->draw, count, attribs); +} + + +void +softpipe_set_vertex_buffers(struct pipe_context *pipe, +                            unsigned count, +                            const struct pipe_vertex_buffer *buffers) +{ +   struct softpipe_context *softpipe = softpipe_context(pipe); + +   assert(count <= PIPE_MAX_ATTRIBS); + +   memcpy(softpipe->vertex_buffer, buffers, count * sizeof(buffers[0])); +   softpipe->num_vertex_buffers = count; + +   softpipe->dirty |= SP_NEW_VERTEX; + +   draw_set_vertex_buffers(softpipe->draw, count, buffers); +} diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c new file mode 100644 index 0000000000..6ade732698 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_surface.c @@ -0,0 +1,38 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#include "util/u_rect.h" +#include "sp_context.h" + + + +void +sp_init_surface_functions(struct softpipe_context *sp) +{ +   sp->pipe.surface_copy = util_surface_copy; +   sp->pipe.surface_fill = util_surface_fill; +} diff --git a/src/gallium/drivers/softpipe/sp_surface.h b/src/gallium/drivers/softpipe/sp_surface.h new file mode 100644 index 0000000000..22de3ba43f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_surface.h @@ -0,0 +1,42 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_SURFACE_H +#define SP_SURFACE_H + + +struct softpipe_context; + + +extern void +sp_init_surface_functions(struct softpipe_context *sp); + + +#endif /* SP_SURFACE_H */ diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c new file mode 100644 index 0000000000..32aa5025e4 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -0,0 +1,1229 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2008 VMware, Inc.  All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/** + * Texture sampling + * + * Authors: + *   Brian Paul + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_texture.h" +#include "sp_tex_sample.h" +#include "sp_tile_cache.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + + +/* + * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes + * see 1-pixel bands of improperly weighted linear-filtered textures. + * The tests/texwrap.c demo is a good test. + * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0. + * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x). + */ +#define FRAC(f)  ((f) - util_ifloor(f)) + + +/** + * Linear interpolation macro + */ +static INLINE float +lerp(float a, float v0, float v1) +{ +   return v0 + a * (v1 - v0); +} + + +/** + * Do 2D/biliner interpolation of float values. + * v00, v10, v01 and v11 are typically four texture samples in a square/box. + * a and b are the horizontal and vertical interpolants. + * It's important that this function is inlined when compiled with + * optimization!  If we find that's not true on some systems, convert + * to a macro. + */ +static INLINE float +lerp_2d(float a, float b, +        float v00, float v10, float v01, float v11) +{ +   const float temp0 = lerp(a, v00, v10); +   const float temp1 = lerp(a, v01, v11); +   return lerp(b, temp0, temp1); +} + + +/** + * As above, but 3D interpolation of 8 values. + */ +static INLINE float +lerp_3d(float a, float b, float c, +        float v000, float v100, float v010, float v110, +        float v001, float v101, float v011, float v111) +{ +   const float temp0 = lerp_2d(a, b, v000, v100, v010, v110); +   const float temp1 = lerp_2d(a, b, v001, v101, v011, v111); +   return lerp(c, temp0, temp1); +} + + + +/** + * If A is a signed integer, A % B doesn't give the right value for A < 0 + * (in terms of texture repeat).  Just casting to unsigned fixes that. + */ +#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B)) + + +/** + * Apply texture coord wrapping mode and return integer texture indexes + * for a vector of four texcoords (S or T or P). 
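[Editorial sketch, not part of the original commit] lerp() and lerp_2d() above implement standard linear and bilinear blending: lerp_2d(a, b, ...) first blends the bottom and top texel pairs horizontally with weight a, then blends those two results vertically with weight b. A standalone numeric check (a = 0.25, b = 0.5 over texel values 0..3); ex_lerp simply mirrors lerp() so the snippet compiles on its own.

#include <stdio.h>

static float ex_lerp(float a, float v0, float v1)
{
   return v0 + a * (v1 - v0);
}

int main(void)
{
   const float v00 = 0.0f, v10 = 1.0f, v01 = 2.0f, v11 = 3.0f;
   const float a = 0.25f, b = 0.5f;

   const float bottom = ex_lerp(a, v00, v10);    /* 0.25 */
   const float top    = ex_lerp(a, v01, v11);    /* 2.25 */
   const float result = ex_lerp(b, bottom, top); /* 1.25 */

   printf("bilinear result = %.2f\n", result);
   return 0;
}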
+ * \param wrapMode  PIPE_TEX_WRAP_x + * \param s  the incoming texcoords + * \param size  the texture image size + * \param icoord  returns the integer texcoords + * \return  integer texture index + */ +static INLINE void +nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, +                   int icoord[4]) +{ +   uint ch; +   switch (wrapMode) { +   case PIPE_TEX_WRAP_REPEAT: +      /* s limited to [0,1) */ +      /* i limited to [0,size-1] */ +      for (ch = 0; ch < 4; ch++) { +         int i = util_ifloor(s[ch] * size); +         icoord[ch] = REMAINDER(i, size); +      } +      return; +   case PIPE_TEX_WRAP_CLAMP: +      /* s limited to [0,1] */ +      /* i limited to [0,size-1] */ +      for (ch = 0; ch < 4; ch++) { +         if (s[ch] <= 0.0F) +            icoord[ch] = 0; +         else if (s[ch] >= 1.0F) +            icoord[ch] = size - 1; +         else +            icoord[ch] = util_ifloor(s[ch] * size); +      } +      return; +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      { +         /* s limited to [min,max] */ +         /* i limited to [0, size-1] */ +         const float min = 1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            if (s[ch] < min) +               icoord[ch] = 0; +            else if (s[ch] > max) +               icoord[ch] = size - 1; +            else +               icoord[ch] = util_ifloor(s[ch] * size); +         } +      } +      return; +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      { +         /* s limited to [min,max] */ +         /* i limited to [-1, size] */ +         const float min = -1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            if (s[ch] <= min) +               icoord[ch] = -1; +            else if (s[ch] >= max) +               icoord[ch] = size; +            else +               icoord[ch] = util_ifloor(s[ch] * size); +         } +      } +      return; +   case PIPE_TEX_WRAP_MIRROR_REPEAT: +      { +         const float min = 1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            const int flr = util_ifloor(s[ch]); +            float u; +            if (flr & 1) +               u = 1.0F - (s[ch] - (float) flr); +            else +               u = s[ch] - (float) flr; +            if (u < min) +               icoord[ch] = 0; +            else if (u > max) +               icoord[ch] = size - 1; +            else +               icoord[ch] = util_ifloor(u * size); +         } +      } +      return; +   case PIPE_TEX_WRAP_MIRROR_CLAMP: +      for (ch = 0; ch < 4; ch++) { +         /* s limited to [0,1] */ +         /* i limited to [0,size-1] */ +         const float u = fabsf(s[ch]); +         if (u <= 0.0F) +            icoord[ch] = 0; +         else if (u >= 1.0F) +            icoord[ch] = size - 1; +         else +            icoord[ch] = util_ifloor(u * size); +      } +      return; +   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +      { +         /* s limited to [min,max] */ +         /* i limited to [0, size-1] */ +         const float min = 1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            const float u = fabsf(s[ch]); +            if (u < min) +               icoord[ch] = 0; +            else if (u > max) +               icoord[ch] = size - 1; +            else +               icoord[ch] = util_ifloor(u * size); +         } +      } +      return; +   case 
PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +      { +         /* s limited to [min,max] */ +         /* i limited to [0, size-1] */ +         const float min = -1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            const float u = fabsf(s[ch]); +            if (u < min) +               icoord[ch] = -1; +            else if (u > max) +               icoord[ch] = size; +            else +               icoord[ch] = util_ifloor(u * size); +         } +      } +      return; +   default: +      assert(0); +   } +} + + +/** + * Used to compute texel locations for linear sampling for four texcoords. + * \param wrapMode  PIPE_TEX_WRAP_x + * \param s  the texcoords + * \param size  the texture image size + * \param icoord0  returns first texture indexes + * \param icoord1  returns second texture indexes (usually icoord0 + 1) + * \param w  returns blend factor/weight between texture indexes + * \param icoord  returns the computed integer texture coords + */ +static INLINE void +linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, +                  int icoord0[4], int icoord1[4], float w[4]) +{ +   uint ch; + +   switch (wrapMode) { +   case PIPE_TEX_WRAP_REPEAT: +      for (ch = 0; ch < 4; ch++) { +         float u = s[ch] * size - 0.5F; +         icoord0[ch] = REMAINDER(util_ifloor(u), size); +         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size); +         w[ch] = FRAC(u); +      } +      break;; +   case PIPE_TEX_WRAP_CLAMP: +      for (ch = 0; ch < 4; ch++) { +         float u = CLAMP(s[ch], 0.0F, 1.0F); +         u = u * size - 0.5f; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         w[ch] = FRAC(u); +      } +      break;; +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      for (ch = 0; ch < 4; ch++) { +         float u = CLAMP(s[ch], 0.0F, 1.0F); +         u = u * size - 0.5f; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         if (icoord0[ch] < 0) +            icoord0[ch] = 0; +         if (icoord1[ch] >= (int) size) +            icoord1[ch] = size - 1; +         w[ch] = FRAC(u); +      } +      break;; +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      { +         const float min = -1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            float u = CLAMP(s[ch], min, max); +            u = u * size - 0.5f; +            icoord0[ch] = util_ifloor(u); +            icoord1[ch] = icoord0[ch] + 1; +            w[ch] = FRAC(u); +         } +      } +      break;; +   case PIPE_TEX_WRAP_MIRROR_REPEAT: +      for (ch = 0; ch < 4; ch++) { +         const int flr = util_ifloor(s[ch]); +         float u; +         if (flr & 1) +            u = 1.0F - (s[ch] - (float) flr); +         else +            u = s[ch] - (float) flr; +         u = u * size - 0.5F; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         if (icoord0[ch] < 0) +            icoord0[ch] = 0; +         if (icoord1[ch] >= (int) size) +            icoord1[ch] = size - 1; +         w[ch] = FRAC(u); +      } +      break;; +   case PIPE_TEX_WRAP_MIRROR_CLAMP: +      for (ch = 0; ch < 4; ch++) { +         float u = fabsf(s[ch]); +         if (u >= 1.0F) +            u = (float) size; +         else +            u *= size; +         u -= 0.5F; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         w[ch] = FRAC(u); +      } +      break;; +   case 
PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: +      for (ch = 0; ch < 4; ch++) { +         float u = fabsf(s[ch]); +         if (u >= 1.0F) +            u = (float) size; +         else +            u *= size; +         u -= 0.5F; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         if (icoord0[ch] < 0) +            icoord0[ch] = 0; +         if (icoord1[ch] >= (int) size) +            icoord1[ch] = size - 1; +         w[ch] = FRAC(u); +      } +      break;; +   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: +      { +         const float min = -1.0F / (2.0F * size); +         const float max = 1.0F - min; +         for (ch = 0; ch < 4; ch++) { +            float u = fabsf(s[ch]); +            if (u <= min) +               u = min * size; +            else if (u >= max) +               u = max * size; +            else +               u *= size; +            u -= 0.5F; +            icoord0[ch] = util_ifloor(u); +            icoord1[ch] = icoord0[ch] + 1; +            w[ch] = FRAC(u); +         } +      } +      break;; +   default: +      assert(0); +   } +} + + +/** + * For RECT textures / unnormalized texcoords + * Only a subset of wrap modes supported. + */ +static INLINE void +nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, +                          int icoord[4]) +{ +   uint ch; +   switch (wrapMode) { +   case PIPE_TEX_WRAP_CLAMP: +      for (ch = 0; ch < 4; ch++) { +         int i = util_ifloor(s[ch]); +         icoord[ch]= CLAMP(i, 0, (int) size-1); +      } +      return; +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      /* fall-through */ +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      for (ch = 0; ch < 4; ch++) { +         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) ); +      } +      return; +   default: +      assert(0); +   } +} + + +/** + * For RECT textures / unnormalized texcoords. + * Only a subset of wrap modes supported. 
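[Editorial sketch, not part of the original commit] For normalized coordinates, linear_texcoord_4() above centers the sample between two texels: u = s * size - 0.5, the two indices are floor(u) and floor(u) + 1 (wrapped according to the wrap mode), and w = FRAC(u) is the blend weight. A standalone check for PIPE_TEX_WRAP_REPEAT; the coordinate is positive here, so a plain modulo stands in for the unsigned-cast REMAINDER macro.

#include <stdio.h>
#include <math.h>

int main(void)
{
   const unsigned size = 8;
   const float s = 1.30f;                     /* repeats past 1.0 */
   const float u = s * size - 0.5f;           /* 9.9 */
   const int   i = (int) floorf(u);           /* 9 */
   const unsigned i0 = (unsigned) i % size;   /* 1 */
   const unsigned i1 = (i0 + 1) % size;       /* 2 */
   const float w = u - floorf(u);             /* 0.9: weight toward texel i1 */

   printf("texels %u and %u, weight %.2f\n", i0, i1, w);
   return 0;
}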
+ */ +static INLINE void +linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, +                         int icoord0[4], int icoord1[4], float w[4]) +{ +   uint ch; +   switch (wrapMode) { +   case PIPE_TEX_WRAP_CLAMP: +      for (ch = 0; ch < 4; ch++) { +         /* Not exactly what the spec says, but it matches NVIDIA output */ +         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f); +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         w[ch] = FRAC(u); +      } +      return; +   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: +      /* fall-through */ +   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: +      for (ch = 0; ch < 4; ch++) { +         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F); +         u -= 0.5F; +         icoord0[ch] = util_ifloor(u); +         icoord1[ch] = icoord0[ch] + 1; +         if (icoord1[ch] > (int) size - 1) +            icoord1[ch] = size - 1; +         w[ch] = FRAC(u); +      } +      break; +   default: +      assert(0); +   } +} + + +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ +   /* +      major axis +      direction     target                             sc     tc    ma +      ----------    -------------------------------    ---    ---   --- +       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx +       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx +       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry +       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry +       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz +       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz +   */ +   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz); +   unsigned face; +   float sc, tc, ma; + +   if (arx > ary && arx > arz) { +      if (rx >= 0.0F) { +         face = PIPE_TEX_FACE_POS_X; +         sc = -rz; +         tc = -ry; +         ma = arx; +      } +      else { +         face = PIPE_TEX_FACE_NEG_X; +         sc = rz; +         tc = -ry; +         ma = arx; +      } +   } +   else if (ary > arx && ary > arz) { +      if (ry >= 0.0F) { +         face = PIPE_TEX_FACE_POS_Y; +         sc = rx; +         tc = rz; +         ma = ary; +      } +      else { +         face = PIPE_TEX_FACE_NEG_Y; +         sc = rx; +         tc = -rz; +         ma = ary; +      } +   } +   else { +      if (rz > 0.0F) { +         face = PIPE_TEX_FACE_POS_Z; +         sc = rx; +         tc = -ry; +         ma = arz; +      } +      else { +         face = PIPE_TEX_FACE_NEG_Z; +         sc = -rx; +         tc = -ry; +         ma = arz; +      } +   } + +   *newS = ( sc / ma + 1.0F ) * 0.5F; +   *newT = ( tc / ma + 1.0F ) * 0.5F; + +   return face; +} + + +/** + * Examine the quad's texture coordinates to compute the partial + * derivatives w.r.t X and Y, then compute lambda (level of detail). + * + * This is only done for fragment shaders, not vertex shaders. 
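[Editorial sketch, not part of the original commit] compute_lambda() below turns per-quad texcoord deltas into a level-of-detail value: rho is the largest texel-space step per pixel (taken across s, t and p, each scaled by the base-level size), and lambda = log2(rho) plus the shader and sampler LOD biases, clamped to [min_lod, max_lod]. A standalone numeric check with only the s axis and the biases/clamping omitted.

#include <stdio.h>
#include <math.h>

int main(void)
{
   /* one-pixel steps across the quad in texcoord space */
   const float dsdx = 1.0f / 64.0f, dsdy = 0.0f;
   const unsigned width = 256;                /* base mipmap level width */

   const float rho    = fmaxf(fabsf(dsdx), fabsf(dsdy)) * width;  /* 4 texels/pixel */
   const float lambda = log2f(rho);                               /* 2.0 -> level 2 */

   printf("rho = %.1f, lambda = %.1f\n", rho, lambda);
   return 0;
}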
+ */ +static float +compute_lambda(const struct pipe_texture *tex, +               const struct pipe_sampler_state *sampler, +               const float s[QUAD_SIZE], +               const float t[QUAD_SIZE], +               const float p[QUAD_SIZE], +               float lodbias) +{ +   float rho, lambda; + +   assert(sampler->normalized_coords); + +   assert(s); +   { +      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]; +      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]; +      dsdx = fabsf(dsdx); +      dsdy = fabsf(dsdy); +      rho = MAX2(dsdx, dsdy) * tex->width[0]; +   } +   if (t) { +      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]; +      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]; +      float max; +      dtdx = fabsf(dtdx); +      dtdy = fabsf(dtdy); +      max = MAX2(dtdx, dtdy) * tex->height[0]; +      rho = MAX2(rho, max); +   } +   if (p) { +      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]; +      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]; +      float max; +      dpdx = fabsf(dpdx); +      dpdy = fabsf(dpdy); +      max = MAX2(dpdx, dpdy) * tex->depth[0]; +      rho = MAX2(rho, max); +   } + +   lambda = util_fast_log2(rho); +   lambda += lodbias + sampler->lod_bias; +   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod); + +   return lambda; +} + + +/** + * Do several things here: + * 1. Compute lambda from the texcoords, if needed + * 2. Determine if we're minifying or magnifying + * 3. If minifying, choose mipmap levels + * 4. Return image filter to use within mipmap images + * \param level0  Returns first mipmap level to sample from + * \param level1  Returns second mipmap level to sample from + * \param levelBlend  Returns blend factor between levels, in [0,1] + * \param imgFilter  Returns either the min or mag filter, depending on lambda + */ +static void +choose_mipmap_levels(const struct pipe_texture *texture, +                     const struct pipe_sampler_state *sampler, +                     const float s[QUAD_SIZE], +                     const float t[QUAD_SIZE], +                     const float p[QUAD_SIZE], +                     boolean computeLambda, +                     float lodbias, +                     unsigned *level0, unsigned *level1, float *levelBlend, +                     unsigned *imgFilter) +{ +   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) { +      /* no mipmap selection needed */ +      *level0 = *level1 = CLAMP((int) sampler->min_lod, +                                0, (int) texture->last_level); + +      if (sampler->min_img_filter != sampler->mag_img_filter) { +         /* non-mipmapped texture, but still need to determine if doing +          * minification or magnification. 
+          */ +         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias); +         if (lambda <= 0.0) { +            *imgFilter = sampler->mag_img_filter; +         } +         else { +            *imgFilter = sampler->min_img_filter; +         } +      } +      else { +         *imgFilter = sampler->mag_img_filter; +      } +   } +   else { +      float lambda; + +      if (computeLambda) +         /* fragment shader */ +         lambda = compute_lambda(texture, sampler, s, t, p, lodbias); +      else +         /* vertex shader */ +         lambda = lodbias; /* not really a bias, but absolute LOD */ + +      if (lambda <= 0.0) { /* XXX threshold depends on the filter */ +         /* magnifying */ +         *imgFilter = sampler->mag_img_filter; +         *level0 = *level1 = 0; +      } +      else { +         /* minifying */ +         *imgFilter = sampler->min_img_filter; + +         /* choose mipmap level(s) and compute the blend factor between them */ +         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { +            /* Nearest mipmap level */ +            const int lvl = (int) (lambda + 0.5); +            *level0 = +            *level1 = CLAMP(lvl, 0, (int) texture->last_level); +         } +         else { +            /* Linear interpolation between mipmap levels */ +            const int lvl = (int) lambda; +            *level0 = CLAMP(lvl,     0, (int) texture->last_level); +            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level); +            *levelBlend = FRAC(lambda);  /* blending weight between levels */ +         } +      } +   } +} + + +/** + * Get a texel from a texture, using the texture tile cache. + * + * \param face  the cube face in 0..5 + * \param level  the mipmap level + * \param x  the x coord of texel within 2D image + * \param y  the y coord of texel within 2D image + * \param z  which slice of a 3D texture + * \param rgba  the quad to put the texel/color into + * \param j  which element of the rgba quad to write to + * + * XXX maybe move this into sp_tile_cache.c and merge with the + * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1... 
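[Editorial sketch, not part of the original commit] choose_mipmap_levels() above maps lambda to concrete levels: with PIPE_TEX_MIPFILTER_NEAREST a single rounded level is used, while PIPE_TEX_MIPFILTER_LINEAR picks floor(lambda) and the next level, keeping FRAC(lambda) as the blend weight that sp_get_samples_2d_common() later uses to lerp the two filtered results. A standalone check for lambda = 1.3 (clamping shown only against last_level).

#include <stdio.h>
#include <math.h>

int main(void)
{
   const float lambda = 1.3f;       /* from compute_lambda() */
   const unsigned last_level = 9;   /* e.g. a 512x512 texture */

   /* PIPE_TEX_MIPFILTER_NEAREST: single rounded level */
   const unsigned nearest = (unsigned) (lambda + 0.5f);                     /* 1 */

   /* PIPE_TEX_MIPFILTER_LINEAR: two levels plus a blend weight */
   const unsigned level0 = (unsigned) lambda;                               /* 1 */
   const unsigned level1 = (level0 + 1 > last_level) ? last_level
                                                     : level0 + 1;          /* 2 */
   const float    blend  = lambda - floorf(lambda);                         /* 0.3 */

   printf("nearest=%u  linear=%u/%u blend=%.1f\n", nearest, level0, level1, blend);
   return 0;
}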
+ */ +static void +get_texel(const struct tgsi_sampler *tgsi_sampler, +          unsigned face, unsigned level, int x, int y, int z, +          float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j) +{ +   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); +   struct softpipe_context *sp = samp->sp; +   const uint unit = samp->unit; +   const struct pipe_texture *texture = sp->texture[unit]; +   const struct pipe_sampler_state *sampler = sp->sampler[unit]; + +   if (x < 0 || x >= (int) texture->width[level] || +       y < 0 || y >= (int) texture->height[level] || +       z < 0 || z >= (int) texture->depth[level]) { +      rgba[0][j] = sampler->border_color[0]; +      rgba[1][j] = sampler->border_color[1]; +      rgba[2][j] = sampler->border_color[2]; +      rgba[3][j] = sampler->border_color[3]; +   } +   else { +      const int tx = x % TILE_SIZE; +      const int ty = y % TILE_SIZE; +      const struct softpipe_cached_tile *tile +         = sp_get_cached_tile_tex(sp, samp->cache, +                                  x, y, z, face, level); +      rgba[0][j] = tile->data.color[ty][tx][0]; +      rgba[1][j] = tile->data.color[ty][tx][1]; +      rgba[2][j] = tile->data.color[ty][tx][2]; +      rgba[3][j] = tile->data.color[ty][tx][3]; +      if (0) +      { +         debug_printf("Get texel %f %f %f %f from %s\n", +                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j], +                      pf_name(texture->format)); +      } +   } +} + + +/** + * Compare texcoord 'p' (aka R) against texture value 'rgba[0]' + * When we sampled the depth texture, the depth value was put into all + * RGBA channels.  We look at the red channel here. + */ +static INLINE void +shadow_compare(uint compare_func, +               float rgba[NUM_CHANNELS][QUAD_SIZE], +               const float p[QUAD_SIZE], +               uint j) +{ +   int k; +   switch (compare_func) { +   case PIPE_FUNC_LESS: +      k = p[j] < rgba[0][j]; +      break; +   case PIPE_FUNC_LEQUAL: +      k = p[j] <= rgba[0][j]; +      break; +   case PIPE_FUNC_GREATER: +      k = p[j] > rgba[0][j]; +      break; +   case PIPE_FUNC_GEQUAL: +      k = p[j] >= rgba[0][j]; +      break; +   case PIPE_FUNC_EQUAL: +      k = p[j] == rgba[0][j]; +      break; +   case PIPE_FUNC_NOTEQUAL: +      k = p[j] != rgba[0][j]; +      break; +   case PIPE_FUNC_ALWAYS: +      k = 1; +      break; +   case PIPE_FUNC_NEVER: +      k = 0; +      break; +   default: +      k = 0; +      assert(0); +      break; +   } + +   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k; +} + + +/** + * Common code for sampling 1D/2D/cube textures. + * Could probably extend for 3D... 
+ */ +static void +sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler, +                         const float s[QUAD_SIZE], +                         const float t[QUAD_SIZE], +                         const float p[QUAD_SIZE], +                         boolean computeLambda, +                         float lodbias, +                         float rgba[NUM_CHANNELS][QUAD_SIZE], +                         const unsigned faces[4]) +{ +   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); +   const struct softpipe_context *sp = samp->sp; +   const uint unit = samp->unit; +   const struct pipe_texture *texture = sp->texture[unit]; +   const struct pipe_sampler_state *sampler = sp->sampler[unit]; +   const uint compare_func = sampler->compare_func; +   unsigned level0, level1, j, imgFilter; +   int width, height; +   float levelBlend; + +   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, +                        &level0, &level1, &levelBlend, &imgFilter); + +   assert(sampler->normalized_coords); + +   width = texture->width[level0]; +   height = texture->height[level0]; + +   assert(width > 0); + +   switch (imgFilter) { +   case PIPE_TEX_FILTER_NEAREST: +      { +         int x[4], y[4]; +         nearest_texcoord_4(sampler->wrap_s, s, width, x); +         nearest_texcoord_4(sampler->wrap_t, t, height, y); + +         for (j = 0; j < QUAD_SIZE; j++) { +            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j); +            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +               shadow_compare(compare_func, rgba, p, j); +            } + +            if (level0 != level1) { +               /* get texels from second mipmap level and blend */ +               float rgba2[4][4]; +               unsigned c; +               x[j] /= 2; +               y[j] /= 2; +               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0, +                         rgba2, j); +               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ +                  shadow_compare(compare_func, rgba2, p, j); +               } + +               for (c = 0; c < NUM_CHANNELS; c++) { +                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); +               } +            } +         } +      } +      break; +   case PIPE_TEX_FILTER_LINEAR: +   case PIPE_TEX_FILTER_ANISO: +      { +         int x0[4], y0[4], x1[4], y1[4]; +         float xw[4], yw[4]; /* weights */ + +         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw); +         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); + +         for (j = 0; j < QUAD_SIZE; j++) { +            float tx[4][4]; /* texels */ +            int c; +            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0); +            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1); +            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2); +            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3); +            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +               shadow_compare(compare_func, tx, p, 0); +               shadow_compare(compare_func, tx, p, 1); +               shadow_compare(compare_func, tx, p, 2); +               shadow_compare(compare_func, tx, p, 3); +            } + +            /* interpolate R, G, B, A */ +            for (c = 0; c < 4; c++) { +               rgba[c][j] = lerp_2d(xw[j], yw[j], +                                  
  tx[c][0], tx[c][1], +                                    tx[c][2], tx[c][3]); +            } + +            if (level0 != level1) { +               /* get texels from second mipmap level and blend */ +               float rgba2[4][4]; +               x0[j] /= 2; +               y0[j] /= 2; +               x1[j] /= 2; +               y1[j] /= 2; +               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0); +               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1); +               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2); +               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3); +               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ +                  shadow_compare(compare_func, tx, p, 0); +                  shadow_compare(compare_func, tx, p, 1); +                  shadow_compare(compare_func, tx, p, 2); +                  shadow_compare(compare_func, tx, p, 3); +               } + +               /* interpolate R, G, B, A */ +               for (c = 0; c < 4; c++) { +                  rgba2[c][j] = lerp_2d(xw[j], yw[j], +                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]); +               } + +               for (c = 0; c < NUM_CHANNELS; c++) { +                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); +               } +            } +         } +      } +      break; +   default: +      assert(0); +   } +} + + +static INLINE void +sp_get_samples_1d(const struct tgsi_sampler *sampler, +                  const float s[QUAD_SIZE], +                  const float t[QUAD_SIZE], +                  const float p[QUAD_SIZE], +                  boolean computeLambda, +                  float lodbias, +                  float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   static const unsigned faces[4] = {0, 0, 0, 0}; +   static const float tzero[4] = {0, 0, 0, 0}; +   sp_get_samples_2d_common(sampler, s, tzero, NULL, +                            computeLambda, lodbias, rgba, faces); +} + + +static INLINE void +sp_get_samples_2d(const struct tgsi_sampler *sampler, +                  const float s[QUAD_SIZE], +                  const float t[QUAD_SIZE], +                  const float p[QUAD_SIZE], +                  boolean computeLambda, +                  float lodbias, +                  float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   static const unsigned faces[4] = {0, 0, 0, 0}; +   sp_get_samples_2d_common(sampler, s, t, p, +                            computeLambda, lodbias, rgba, faces); +} + + +static INLINE void +sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler, +                  const float s[QUAD_SIZE], +                  const float t[QUAD_SIZE], +                  const float p[QUAD_SIZE], +                  boolean computeLambda, +                  float lodbias, +                  float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); +   const struct softpipe_context *sp = samp->sp; +   const uint unit = samp->unit; +   const struct pipe_texture *texture = sp->texture[unit]; +   const struct pipe_sampler_state *sampler = sp->sampler[unit]; +   /* get/map pipe_surfaces corresponding to 3D tex slices */ +   unsigned level0, level1, j, imgFilter; +   int width, height, depth; +   float levelBlend; +   const uint face = 0; + +   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, +                        &level0, &level1, &levelBlend, &imgFilter); + 
+   assert(sampler->normalized_coords); + +   width = texture->width[level0]; +   height = texture->height[level0]; +   depth = texture->depth[level0]; + +   assert(width > 0); +   assert(height > 0); +   assert(depth > 0); + +   switch (imgFilter) { +   case PIPE_TEX_FILTER_NEAREST: +      { +         int x[4], y[4], z[4]; +         nearest_texcoord_4(sampler->wrap_s, s, width, x); +         nearest_texcoord_4(sampler->wrap_t, t, height, y); +         nearest_texcoord_4(sampler->wrap_r, p, depth, z); +         for (j = 0; j < QUAD_SIZE; j++) { +            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j); +            if (level0 != level1) { +               /* get texels from second mipmap level and blend */ +               float rgba2[4][4]; +               unsigned c; +               x[j] /= 2; +               y[j] /= 2; +               z[j] /= 2; +               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j); +               for (c = 0; c < NUM_CHANNELS; c++) { +                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]); +               } +            } +         } +      } +      break; +   case PIPE_TEX_FILTER_LINEAR: +   case PIPE_TEX_FILTER_ANISO: +      { +         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4]; +         float xw[4], yw[4], zw[4]; /* interpolation weights */ +         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw); +         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); +         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw); + +         for (j = 0; j < QUAD_SIZE; j++) { +            int c; +            float tx0[4][4], tx1[4][4]; +            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0); +            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1); +            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2); +            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3); +            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0); +            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1); +            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2); +            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3); + +            /* interpolate R, G, B, A */ +            for (c = 0; c < 4; c++) { +               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j], +                                    tx0[c][0], tx0[c][1], +                                    tx0[c][2], tx0[c][3], +                                    tx1[c][0], tx1[c][1], +                                    tx1[c][2], tx1[c][3]); +            } + +            if (level0 != level1) { +               /* get texels from second mipmap level and blend */ +               float rgba2[4][4]; +               x0[j] /= 2; +               y0[j] /= 2; +               z0[j] /= 2; +               x1[j] /= 2; +               y1[j] /= 2; +               z1[j] /= 2; +               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0); +               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1); +               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2); +               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3); +               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0); +               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1); +               
get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2); +               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3); + +               /* interpolate R, G, B, A */ +               for (c = 0; c < 4; c++) { +                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j], +                                        tx0[c][0], tx0[c][1], +                                        tx0[c][2], tx0[c][3], +                                        tx1[c][0], tx1[c][1], +                                        tx1[c][2], tx1[c][3]); +               } + +               /* blend mipmap levels */ +               for (c = 0; c < NUM_CHANNELS; c++) { +                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); +               } +            } +         } +      } +      break; +   default: +      assert(0); +   } +} + + +static void +sp_get_samples_cube(const struct tgsi_sampler *sampler, +                    const float s[QUAD_SIZE], +                    const float t[QUAD_SIZE], +                    const float p[QUAD_SIZE], +                    boolean computeLambda, +                    float lodbias, +                    float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   unsigned faces[QUAD_SIZE], j; +   float ssss[4], tttt[4]; +   for (j = 0; j < QUAD_SIZE; j++) { +      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j); +   } +   sp_get_samples_2d_common(sampler, ssss, tttt, NULL, +                            computeLambda, lodbias, rgba, faces); +} + + +static void +sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler, +                    const float s[QUAD_SIZE], +                    const float t[QUAD_SIZE], +                    const float p[QUAD_SIZE], +                    boolean computeLambda, +                    float lodbias, +                    float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); +   const struct softpipe_context *sp = samp->sp; +   const uint unit = samp->unit; +   const struct pipe_texture *texture = sp->texture[unit]; +   const struct pipe_sampler_state *sampler = sp->sampler[unit]; +   const uint face = 0; +   const uint compare_func = sampler->compare_func; +   unsigned level0, level1, j, imgFilter; +   int width, height; +   float levelBlend; + +   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, +                        &level0, &level1, &levelBlend, &imgFilter); + +   /* texture RECTS cannot be mipmapped */ +   assert(level0 == level1); + +   width = texture->width[level0]; +   height = texture->height[level0]; + +   assert(width > 0); + +   switch (imgFilter) { +   case PIPE_TEX_FILTER_NEAREST: +      { +         int x[4], y[4]; +         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x); +         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y); +         for (j = 0; j < QUAD_SIZE; j++) { +            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j); +            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +               shadow_compare(compare_func, rgba, p, j); +            } +         } +      } +      break; +   case PIPE_TEX_FILTER_LINEAR: +   case PIPE_TEX_FILTER_ANISO: +      { +         int x0[4], y0[4], x1[4], y1[4]; +         float xw[4], yw[4]; /* weights */ +         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw); +         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw); +         for (j = 0; j < QUAD_SIZE; j++) { +           
 float tx[4][4]; /* texels */ +            int c; +            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0); +            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1); +            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2); +            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3); +            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { +               shadow_compare(compare_func, tx, p, 0); +               shadow_compare(compare_func, tx, p, 1); +               shadow_compare(compare_func, tx, p, 2); +               shadow_compare(compare_func, tx, p, 3); +            } +            for (c = 0; c < 4; c++) { +               rgba[c][j] = lerp_2d(xw[j], yw[j], +                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]); +            } +         } +      } +      break; +   default: +      assert(0); +   } +} + + +/** + * Common code for vertex/fragment program texture sampling. + */ +static INLINE void +sp_get_samples(struct tgsi_sampler *tgsi_sampler, +               const float s[QUAD_SIZE], +               const float t[QUAD_SIZE], +               const float p[QUAD_SIZE], +               boolean computeLambda, +               float lodbias, +               float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); +   const struct softpipe_context *sp = samp->sp; +   const uint unit = samp->unit; +   const struct pipe_texture *texture = sp->texture[unit]; +   const struct pipe_sampler_state *sampler = sp->sampler[unit]; + +   if (!texture) +      return; + +   switch (texture->target) { +   case PIPE_TEXTURE_1D: +      assert(sampler->normalized_coords); +      sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); +      break; +   case PIPE_TEXTURE_2D: +      if (sampler->normalized_coords) +         sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); +      else +         sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); +      break; +   case PIPE_TEXTURE_3D: +      assert(sampler->normalized_coords); +      sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); +      break; +   case PIPE_TEXTURE_CUBE: +      assert(sampler->normalized_coords); +      sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); +      break; +   default: +      assert(0); +   } + +#if 0 /* DEBUG */ +   { +      int i; +      printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]); +      for (i = 0; i < 4; i++) { +         printf("Frag %d: %f %f %f %f\n", i, +                rgba[0][i], +                rgba[1][i], +                rgba[2][i], +                rgba[3][i]); +      } +   } +#endif +} + + +/** + * Called via tgsi_sampler::get_samples() when running a fragment shader. + * Get four filtered RGBA values from the sampler's texture. + */ +void +sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler, +                        const float s[QUAD_SIZE], +                        const float t[QUAD_SIZE], +                        const float p[QUAD_SIZE], +                        float lodbias, +                        float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba); +} + + +/** + * Called via tgsi_sampler::get_samples() when running a vertex shader. + * Get four filtered RGBA values from the sampler's texture. 
+ */ +void +sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler, +                      const float s[QUAD_SIZE], +                      const float t[QUAD_SIZE], +                      const float p[QUAD_SIZE], +                      float lodbias, +                      float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ +   sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba); +} diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h new file mode 100644 index 0000000000..40d8eb2c2a --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tex_sample.h @@ -0,0 +1,73 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SP_TEX_SAMPLE_H +#define SP_TEX_SAMPLE_H + + +#include "tgsi/tgsi_exec.h" + + +/** + * Subclass of tgsi_sampler + */ +struct sp_shader_sampler +{ +   struct tgsi_sampler base;  /**< base class */ + +   uint unit; +   struct softpipe_context *sp; +   struct softpipe_tile_cache *cache; +}; + + + +static INLINE const struct sp_shader_sampler * +sp_shader_sampler(const struct tgsi_sampler *sampler) +{ +   return (const struct sp_shader_sampler *) sampler; +} + + +extern void +sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler, +                        const float s[QUAD_SIZE], +                        const float t[QUAD_SIZE], +                        const float p[QUAD_SIZE], +                        float lodbias, +                        float rgba[NUM_CHANNELS][QUAD_SIZE]); + +extern void +sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler, +                      const float s[QUAD_SIZE], +                      const float t[QUAD_SIZE], +                      const float p[QUAD_SIZE], +                      float lodbias, +                      float rgba[NUM_CHANNELS][QUAD_SIZE]); + + +#endif /* SP_TEX_SAMPLE_H */ diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c new file mode 100644 index 0000000000..3eed0d0d29 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -0,0 +1,348 @@ +/************************************************************************** + *  + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + /* +  * Authors: +  *   Keith Whitwell <keith@tungstengraphics.com> +  *   Michel Dänzer <michel@tungstengraphics.com> +  */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" +#include "sp_screen.h" + + +/* Simple, maximally packed layout. + */ + +static unsigned minify( unsigned d ) +{ +   return MAX2(1, d>>1); +} + + +/* Conventional allocation path for non-display textures: + */ +static boolean +softpipe_texture_layout(struct pipe_screen *screen, +                        struct softpipe_texture * spt) +{ +   struct pipe_winsys *ws = screen->winsys; +   struct pipe_texture *pt = &spt->base; +   unsigned level; +   unsigned width = pt->width[0]; +   unsigned height = pt->height[0]; +   unsigned depth = pt->depth[0]; + +   unsigned buffer_size = 0; + +   for (level = 0; level <= pt->last_level; level++) { +      pt->width[level] = width; +      pt->height[level] = height; +      pt->depth[level] = depth; +      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);   +      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);   +      spt->stride[level] = pt->nblocksx[level]*pt->block.size; + +      spt->level_offset[level] = buffer_size; + +      buffer_size += (pt->nblocksy[level] * +                      ((pt->target == PIPE_TEXTURE_CUBE) ? 
6 : depth) * +                      spt->stride[level]); + +      width  = minify(width); +      height = minify(height); +      depth = minify(depth); +   } + +   spt->buffer = ws->buffer_create(ws, 32, +                                   PIPE_BUFFER_USAGE_PIXEL, +                                   buffer_size); + +   return spt->buffer != NULL; +} + +static boolean +softpipe_displaytarget_layout(struct pipe_screen *screen, +                              struct softpipe_texture * spt) +{ +   struct pipe_winsys *ws = screen->winsys; +   unsigned usage = (PIPE_BUFFER_USAGE_CPU_READ_WRITE | +                     PIPE_BUFFER_USAGE_GPU_READ_WRITE); + +   spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]);   +   spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);   + +   spt->buffer = ws->surface_buffer_create( ws,  +                                            spt->base.width[0],  +                                            spt->base.height[0], +                                            spt->base.format, +                                            usage, +                                            &spt->stride[0]); + +   return spt->buffer != NULL; +} + + + + + +static struct pipe_texture * +softpipe_texture_create(struct pipe_screen *screen, +                        const struct pipe_texture *templat) +{ +   struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture); +   if (!spt) +      return NULL; + +   spt->base = *templat; +   spt->base.refcount = 1; +   spt->base.screen = screen; + +   if (spt->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET) { +      if (!softpipe_displaytarget_layout(screen, spt)) +         goto fail; +   } +   else { +      if (!softpipe_texture_layout(screen, spt)) +         goto fail; +   } +     +   assert(spt->base.refcount == 1); +   return &spt->base; + + fail: +   FREE(spt); +   return NULL; +} + + +static struct pipe_texture * +softpipe_texture_blanket(struct pipe_screen * screen, +                         const struct pipe_texture *base, +                         const unsigned *stride, +                         struct pipe_buffer *buffer) +{ +   struct softpipe_texture *spt; +   assert(screen); + +   /* Only supports one type */ +   if (base->target != PIPE_TEXTURE_2D || +       base->last_level != 0 || +       base->depth[0] != 1) { +      return NULL; +   } + +   spt = CALLOC_STRUCT(softpipe_texture); +   if (!spt) +      return NULL; + +   spt->base = *base; +   spt->base.refcount = 1; +   spt->base.screen = screen; +   spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]);   +   spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);   +   spt->stride[0] = stride[0]; + +   pipe_buffer_reference(screen, &spt->buffer, buffer); + +   return &spt->base; +} + + +static void +softpipe_texture_release(struct pipe_screen *screen, +                         struct pipe_texture **pt) +{ +   if (!*pt) +      return; + +   if (--(*pt)->refcount <= 0) { +      struct softpipe_texture *spt = softpipe_texture(*pt); + +      pipe_buffer_reference(screen, &spt->buffer, NULL); +      FREE(spt); +   } +   *pt = NULL; +} + + +static struct pipe_surface * +softpipe_get_tex_surface(struct pipe_screen *screen, +                         struct pipe_texture *pt, +                         unsigned face, unsigned level, unsigned zslice, +                         unsigned usage) +{ +   struct softpipe_texture *spt = softpipe_texture(pt); +   struct pipe_surface *ps; + + 
  assert(level <= pt->last_level); + +   ps = CALLOC_STRUCT(pipe_surface); +   if (ps) { +      ps->refcount = 1; +      pipe_texture_reference(&ps->texture, pt); +      ps->format = pt->format; +      ps->block = pt->block; +      ps->width = pt->width[level]; +      ps->height = pt->height[level]; +      ps->nblocksx = pt->nblocksx[level]; +      ps->nblocksy = pt->nblocksy[level]; +      ps->stride = spt->stride[level]; +      ps->offset = spt->level_offset[level]; +      ps->usage = usage; + +      /* Because we are softpipe, anything that the state tracker +       * thought was going to be done with the GPU will actually get +       * done with the CPU.  Let's adjust the flags to take that into +       * account. +       */ +      if (ps->usage & PIPE_BUFFER_USAGE_GPU_WRITE) { +         /* GPU_WRITE means "render" and that can involve reads (blending) */ +         ps->usage |= PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_CPU_READ; +      } + +      if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ) +         ps->usage |= PIPE_BUFFER_USAGE_CPU_READ; + +      if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE | +                       PIPE_BUFFER_USAGE_GPU_WRITE)) { +         /* Mark the surface as dirty.  The tile cache will look for this. */ +         spt->modified = TRUE; +      } + +      ps->face = face; +      ps->level = level; +      ps->zslice = zslice; + +      if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { +         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * +            ps->nblocksy * +            ps->stride; +      } +      else { +         assert(face == 0); +         assert(zslice == 0); +      } +   } +   return ps; +} + + +static void  +softpipe_tex_surface_release(struct pipe_screen *screen,  +                             struct pipe_surface **s) +{ +   struct pipe_surface *surf = *s; +   /* Effectively do the texture_update work here - if texture images +    * needed post-processing to put them into hardware layout, this is +    * where it would happen.  For softpipe, nothing to do. +    */ +   assert(surf->texture); +   if (--surf->refcount == 0) { +      pipe_texture_reference(&surf->texture, NULL); +      FREE(surf); +   } +   *s = NULL; +} + + +static void * +softpipe_surface_map( struct pipe_screen *screen, +                      struct pipe_surface *surface, +                      unsigned flags ) +{ +   ubyte *map; +   struct softpipe_texture *spt; + +   if (flags & ~surface->usage) { +      assert(0); +      return NULL; +   } + +   assert(surface->texture); +   spt = softpipe_texture(surface->texture); +   map = pipe_buffer_map(screen, spt->buffer, flags); +   if (map == NULL) +      return NULL; + +   /* May want to different things here depending on read/write nature +    * of the map: +    */ +   if (surface->texture && +       (flags & PIPE_BUFFER_USAGE_CPU_WRITE))  +   { +      /* Do something to notify sharing contexts of a texture change. +       * In softpipe, that would mean flushing the texture cache. 
+       */ +      softpipe_screen(screen)->timestamp++; +   } +    +   return map + surface->offset; +} + + +static void +softpipe_surface_unmap(struct pipe_screen *screen, +                       struct pipe_surface *surface) +{ +   struct softpipe_texture *spt; + +   assert(surface->texture); +   spt = softpipe_texture(surface->texture); + +   pipe_buffer_unmap( screen, spt->buffer ); +} + + +void +softpipe_init_texture_funcs(struct softpipe_context *sp) +{ +} + + +void +softpipe_init_screen_texture_funcs(struct pipe_screen *screen) +{ +   screen->texture_create = softpipe_texture_create; +   screen->texture_blanket = softpipe_texture_blanket; +   screen->texture_release = softpipe_texture_release; + +   screen->get_tex_surface = softpipe_get_tex_surface; +   screen->tex_surface_release = softpipe_tex_surface_release; + +   screen->surface_map = softpipe_surface_map; +   screen->surface_unmap = softpipe_surface_unmap; +} diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h new file mode 100644 index 0000000000..c1636920cd --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -0,0 +1,70 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +#ifndef SP_TEXTURE_H +#define SP_TEXTURE_H + + +#include "pipe/p_state.h" + + +struct pipe_context; +struct pipe_screen; +struct softpipe_context; + + +struct softpipe_texture +{ +   struct pipe_texture base; + +   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS]; +   unsigned stride[PIPE_MAX_TEXTURE_LEVELS]; + +   /* The data is held here: +    */ +   struct pipe_buffer *buffer; + +   boolean modified; +}; + + +/** cast wrapper */ +static INLINE struct softpipe_texture * +softpipe_texture(struct pipe_texture *pt) +{ +   return (struct softpipe_texture *) pt; +} + + +extern void +softpipe_init_texture_funcs( struct softpipe_context *softpipe ); + +extern void +softpipe_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif /* SP_TEXTURE */ diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c new file mode 100644 index 0000000000..ab76009375 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -0,0 +1,614 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/** + * Framebuffer/surface tile caching. + * + * Author: + *    Brian Paul + */ + +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_tile.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" + +#define NUM_ENTRIES 32 + + +/** XXX move these */ +#define MAX_WIDTH 2048 +#define MAX_HEIGHT 2048 + + +struct softpipe_tile_cache +{ +   struct pipe_screen *screen; +   struct pipe_surface *surface;  /**< the surface we're caching */ +   void *surface_map; +   struct pipe_texture *texture;  /**< if caching a texture */ +   struct softpipe_cached_tile entries[NUM_ENTRIES]; +   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32]; +   float clear_color[4]; +   uint clear_val; +   boolean depth_stencil; /** Is the surface a depth/stencil format? 
*/
+
+   struct pipe_surface *tex_surf;
+   void *tex_surf_map;
+   int tex_face, tex_level, tex_z;
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+};
+
+
+/**
+ * Return the position in the cache for the tile that contains win pos (x,y).
+ * We currently use a direct mapped cache so this is like a hash key.
+ * At some point we should investigate something more sophisticated, like
+ * an LRU replacement policy.
+ */
+#define CACHE_POS(x, y) \
+   (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES)
+
+
+
+/**
+ * Is the tile at (x,y) in cleared state?
+ */
+static INLINE uint
+is_clear_flag_set(const uint *bitvec, int x, int y)
+{
+   int pos, bit;
+   x /= TILE_SIZE;
+   y /= TILE_SIZE;
+   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
+   bit = bitvec[pos / 32] & (1 << (pos & 31));
+   return bit;
+}
+
+
+/**
+ * Mark the tile at (x,y) as not cleared.
+ */
+static INLINE void
+clear_clear_flag(uint *bitvec, int x, int y)
+{
+   int pos;
+   x /= TILE_SIZE;
+   y /= TILE_SIZE;
+   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
+   bitvec[pos / 32] &= ~(1 << (pos & 31));
+}
+
+
+struct softpipe_tile_cache *
+sp_create_tile_cache( struct pipe_screen *screen )
+{
+   struct softpipe_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( softpipe_tile_cache );
+   if (tc) {
+      tc->screen = screen;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].x =
+         tc->entries[pos].y = -1;
+      }
+   }
+   return tc;
+}
+
+
+void
+sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
+{
+   uint pos;
+
+   for (pos = 0; pos < NUM_ENTRIES; pos++) {
+      /*assert(tc->entries[pos].x < 0);*/
+   }
+   if (tc->surface) {
+      pipe_surface_reference(&tc->surface, NULL);
+   }
+   if (tc->tex_surf) {
+      pipe_surface_reference(&tc->tex_surf, NULL);
+   }
+
+   FREE( tc );
+}
+
+
+/**
+ * Specify the surface to cache.
+ */
+void
+sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
+                          struct pipe_surface *ps)
+{
+   assert(!tc->texture);
+
+   if (tc->surface_map) {
+      tc->screen->surface_unmap(tc->screen, tc->surface);
+      tc->surface_map = NULL;
+   }
+
+   pipe_surface_reference(&tc->surface, ps);
+
+   if (tc->surface) {
+      if (tc->surface_map) /* XXX: this is always NULL!? */
+	 tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface,
+                                                   PIPE_BUFFER_USAGE_CPU_READ |
+                                                   PIPE_BUFFER_USAGE_CPU_WRITE);
+
+      tc->depth_stencil = (ps->format == PIPE_FORMAT_S8Z24_UNORM ||
+                           ps->format == PIPE_FORMAT_X8Z24_UNORM ||
+                           ps->format == PIPE_FORMAT_Z24S8_UNORM ||
+                           ps->format == PIPE_FORMAT_Z24X8_UNORM ||
+                           ps->format == PIPE_FORMAT_Z16_UNORM ||
+                           ps->format == PIPE_FORMAT_Z32_UNORM ||
+                           ps->format == PIPE_FORMAT_S8_UNORM);
+   }
+}
+
+
+/**
+ * Return the surface being cached.
+ */ +struct pipe_surface * +sp_tile_cache_get_surface(struct softpipe_tile_cache *tc) +{ +   return tc->surface; +} + + +void +sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc) +{ +   if (tc->surface && !tc->surface_map) +      tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface, +                                                PIPE_BUFFER_USAGE_CPU_WRITE | +                                                PIPE_BUFFER_USAGE_CPU_READ); + +   if (tc->tex_surf && !tc->tex_surf_map) +      tc->tex_surf_map = tc->screen->surface_map(tc->screen, tc->tex_surf, +                                                 PIPE_BUFFER_USAGE_CPU_READ); +} + + +void +sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc) +{ +   if (tc->surface_map) { +      tc->screen->surface_unmap(tc->screen, tc->surface); +      tc->surface_map = NULL; +   } + +   if (tc->tex_surf_map) { +      tc->screen->surface_unmap(tc->screen, tc->tex_surf); +      tc->tex_surf_map = NULL; +   } +} + + +/** + * Specify the texture to cache. + */ +void +sp_tile_cache_set_texture(struct pipe_context *pipe, +                          struct softpipe_tile_cache *tc, +                          struct pipe_texture *texture) +{ +   uint i; + +   assert(!tc->surface); + +   pipe_texture_reference(&tc->texture, texture); + +   if (tc->tex_surf_map) { +      tc->screen->surface_unmap(tc->screen, tc->tex_surf); +      tc->tex_surf_map = NULL; +   } +   pipe_surface_reference(&tc->tex_surf, NULL); + +   /* mark as entries as invalid/empty */ +   /* XXX we should try to avoid this when the teximage hasn't changed */ +   for (i = 0; i < NUM_ENTRIES; i++) { +      tc->entries[i].x = -1; +   } + +   tc->tex_face = -1; /* any invalid value here */ +} + + +/** + * Set pixels in a tile to the given clear color/value, float. + */ +static void +clear_tile_rgba(struct softpipe_cached_tile *tile, +                enum pipe_format format, +                const float clear_value[4]) +{ +   if (clear_value[0] == 0.0 && +       clear_value[1] == 0.0 && +       clear_value[2] == 0.0 && +       clear_value[3] == 0.0) { +      memset(tile->data.color, 0, sizeof(tile->data.color)); +   } +   else { +      uint i, j; +      for (i = 0; i < TILE_SIZE; i++) { +         for (j = 0; j < TILE_SIZE; j++) { +            tile->data.color[i][j][0] = clear_value[0]; +            tile->data.color[i][j][1] = clear_value[1]; +            tile->data.color[i][j][2] = clear_value[2]; +            tile->data.color[i][j][3] = clear_value[3]; +         } +      } +   } +} + + +/** + * Set a tile to a solid value/color. 
+ */ +static void +clear_tile(struct softpipe_cached_tile *tile, +           enum pipe_format format, +           uint clear_value) +{ +   uint i, j; + +   switch (pf_get_size(format)) { +   case 1: +      memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE); +      break; +   case 2: +      if (clear_value == 0) { +         memset(tile->data.any, 0, 2 * TILE_SIZE * TILE_SIZE); +      } +      else { +         for (i = 0; i < TILE_SIZE; i++) { +            for (j = 0; j < TILE_SIZE; j++) { +               tile->data.depth16[i][j] = (ushort) clear_value; +            } +         } +      } +      break; +   case 4: +      if (clear_value == 0) { +         memset(tile->data.any, 0, 4 * TILE_SIZE * TILE_SIZE); +      } +      else { +         for (i = 0; i < TILE_SIZE; i++) { +            for (j = 0; j < TILE_SIZE; j++) { +               tile->data.color32[i][j] = clear_value; +            } +         } +      } +      break; +   default: +      assert(0); +   } +} + + +/** + * Actually clear the tiles which were flagged as being in a clear state. + */ +static void +sp_tile_cache_flush_clear(struct pipe_context *pipe, +                          struct softpipe_tile_cache *tc) +{ +   struct pipe_surface *ps = tc->surface; +   const uint w = tc->surface->width; +   const uint h = tc->surface->height; +   uint x, y; +   uint numCleared = 0; + +   /* clear the scratch tile to the clear value */ +   clear_tile(&tc->tile, ps->format, tc->clear_val); + +   /* push the tile to all positions marked as clear */ +   for (y = 0; y < h; y += TILE_SIZE) { +      for (x = 0; x < w; x += TILE_SIZE) { +         if (is_clear_flag_set(tc->clear_flags, x, y)) { +            pipe_put_tile_raw(ps, +                              x, y, TILE_SIZE, TILE_SIZE, +                              tc->tile.data.color32, 0/*STRIDE*/); + +            /* do this? */ +            clear_clear_flag(tc->clear_flags, x, y); + +            numCleared++; +         } +      } +   } +#if 0 +   debug_printf("num cleared: %u\n", numCleared); +#endif +} + + +/** + * Flush the tile cache: write all dirty tiles back to the surface. + * any tiles "flagged" as cleared will be "really" cleared. + */ +void +sp_flush_tile_cache(struct softpipe_context *softpipe, +                    struct softpipe_tile_cache *tc) +{ +   struct pipe_surface *ps = tc->surface; +   int inuse = 0, pos; + +   if (ps) { +      /* caching a drawing surface */ +      for (pos = 0; pos < NUM_ENTRIES; pos++) { +         struct softpipe_cached_tile *tile = tc->entries + pos; +         if (tile->x >= 0) { +            if (tc->depth_stencil) { +               pipe_put_tile_raw(ps, +                                 tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                                 tile->data.depth32, 0/*STRIDE*/); +            } +            else { +               pipe_put_tile_rgba(ps, +                                  tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                                  (float *) tile->data.color); +            } +            tile->x = tile->y = -1;  /* mark as empty */ +            inuse++; +         } +      } + +#if TILE_CLEAR_OPTIMIZATION +      sp_tile_cache_flush_clear(&softpipe->pipe, tc); +#endif +   } +   else if (tc->texture) { +      /* caching a texture, mark all entries as empty */ +      for (pos = 0; pos < NUM_ENTRIES; pos++) { +         tc->entries[pos].x = -1; +      } +      tc->tex_face = -1; +   } + +#if 0 +   debug_printf("flushed tiles in use: %d\n", inuse); +#endif +} + + +/** + * Get a tile from the cache. 
+ * \param x, y  position of tile, in pixels + */ +struct softpipe_cached_tile * +sp_get_cached_tile(struct softpipe_context *softpipe, +                   struct softpipe_tile_cache *tc, int x, int y) +{ +   struct pipe_surface *ps = tc->surface; + +   /* tile pos in framebuffer: */ +   const int tile_x = x & ~(TILE_SIZE - 1); +   const int tile_y = y & ~(TILE_SIZE - 1); + +   /* cache pos/entry: */ +   const int pos = CACHE_POS(x, y); +   struct softpipe_cached_tile *tile = tc->entries + pos; + +   if (tile_x != tile->x || +       tile_y != tile->y) { + +      if (tile->x != -1) { +         /* put dirty tile back in framebuffer */ +         if (tc->depth_stencil) { +            pipe_put_tile_raw(ps, +                              tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                              tile->data.depth32, 0/*STRIDE*/); +         } +         else { +            pipe_put_tile_rgba(ps, +                               tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                               (float *) tile->data.color); +         } +      } + +      tile->x = tile_x; +      tile->y = tile_y; + +      if (is_clear_flag_set(tc->clear_flags, x, y)) { +         /* don't get tile from framebuffer, just clear it */ +         if (tc->depth_stencil) { +            clear_tile(tile, ps->format, tc->clear_val); +         } +         else { +            clear_tile_rgba(tile, ps->format, tc->clear_color); +         } +         clear_clear_flag(tc->clear_flags, x, y); +      } +      else { +         /* get new tile data from surface */ +         if (tc->depth_stencil) { +            pipe_get_tile_raw(ps, +                              tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                              tile->data.depth32, 0/*STRIDE*/); +         } +         else { +            pipe_get_tile_rgba(ps, +                               tile->x, tile->y, TILE_SIZE, TILE_SIZE, +                               (float *) tile->data.color); +         } +      } +   } + +   return tile; +} + + +/** + * Given the texture face, level, zslice, x and y values, compute + * the cache entry position/index where we'd hope to find the + * cached texture tile. + * This is basically a direct-map cache. + * XXX There's probably lots of ways in which we can improve this. + */ +static INLINE uint +tex_cache_pos(int x, int y, int z, int face, int level) +{ +   uint entry = x + y * 5 + z * 4 + face + level; +   return entry % NUM_ENTRIES; +} + + +/** + * Similar to sp_get_cached_tile() but for textures. + * Tiles are read-only and indexed with more params. 
+ */ +const struct softpipe_cached_tile * +sp_get_cached_tile_tex(struct softpipe_context *sp, +                       struct softpipe_tile_cache *tc, int x, int y, int z, +                       int face, int level) +{ +   struct pipe_screen *screen = sp->pipe.screen; +   /* tile pos in framebuffer: */ +   const int tile_x = x & ~(TILE_SIZE - 1); +   const int tile_y = y & ~(TILE_SIZE - 1); +   /* cache pos/entry: */ +   const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z, +                                  face, level); +   struct softpipe_cached_tile *tile = tc->entries + pos; + +   if (tc->texture) { +      struct softpipe_texture *spt = softpipe_texture(tc->texture); +      if (spt->modified) { +         /* texture was modified, force a cache reload */ +         tile->x = -1; +         spt->modified = FALSE; +      } +   } + +   if (tile_x != tile->x || +       tile_y != tile->y || +       z != tile->z || +       face != tile->face || +       level != tile->level) { +      /* cache miss */ + +      /* check if we need to get a new surface */ +      if (!tc->tex_surf || +          tc->tex_face != face || +          tc->tex_level != level || +          tc->tex_z != z) { +         /* get new surface (view into texture) */ + +	 if (tc->tex_surf_map) +            tc->screen->surface_unmap(tc->screen, tc->tex_surf); + +         tc->tex_surf = screen->get_tex_surface(screen, tc->texture, face, level, z,  +                                                PIPE_BUFFER_USAGE_CPU_READ); +         tc->tex_surf_map = screen->surface_map(screen, tc->tex_surf, +                                                PIPE_BUFFER_USAGE_CPU_READ); + +         tc->tex_face = face; +         tc->tex_level = level; +         tc->tex_z = z; +      } + +      /* get tile from the surface (view into texture) */ +      pipe_get_tile_rgba(tc->tex_surf, +                         tile_x, tile_y, TILE_SIZE, TILE_SIZE, +                         (float *) tile->data.color); +      tile->x = tile_x; +      tile->y = tile_y; +      tile->z = z; +      tile->face = face; +      tile->level = level; +   } + +   return tile; +} + + +/** + * When a whole surface is being cleared to a value we can avoid + * fetching tiles above. + * Save the color and set a 'clearflag' for each tile of the screen. 
+ */ +void +sp_tile_cache_clear(struct softpipe_tile_cache *tc, uint clearValue) +{ +   uint r, g, b, a; +   uint pos; + +   tc->clear_val = clearValue; + +   switch (tc->surface->format) { +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      r = (clearValue >> 24) & 0xff; +      g = (clearValue >> 16) & 0xff; +      b = (clearValue >>  8) & 0xff; +      a = (clearValue      ) & 0xff; +      break; +   case PIPE_FORMAT_A8R8G8B8_UNORM: +      r = (clearValue >> 16) & 0xff; +      g = (clearValue >>  8) & 0xff; +      b = (clearValue      ) & 0xff; +      a = (clearValue >> 24) & 0xff; +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      r = (clearValue >>  8) & 0xff; +      g = (clearValue >> 16) & 0xff; +      b = (clearValue >> 24) & 0xff; +      a = (clearValue      ) & 0xff; +      break; +   default: +      r = g = b = a = 0; +   } + +   tc->clear_color[0] = r / 255.0f; +   tc->clear_color[1] = g / 255.0f; +   tc->clear_color[2] = b / 255.0f; +   tc->clear_color[3] = a / 255.0f; + +#if TILE_CLEAR_OPTIMIZATION +   /* set flags to indicate all the tiles are cleared */ +   memset(tc->clear_flags, 255, sizeof(tc->clear_flags)); +#else +   /* disable the optimization */ +   memset(tc->clear_flags, 0, sizeof(tc->clear_flags)); +#endif + +   for (pos = 0; pos < NUM_ENTRIES; pos++) { +      struct softpipe_cached_tile *tile = tc->entries + pos; +      tile->x = tile->y = -1; +   } +} diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h new file mode 100644 index 0000000000..a66bb50bcc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tile_cache.h @@ -0,0 +1,105 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +#ifndef SP_TILE_CACHE_H +#define SP_TILE_CACHE_H + +#define TILE_CLEAR_OPTIMIZATION 1 + + +#include "pipe/p_compiler.h" + + +struct softpipe_context; +struct softpipe_tile_cache; + + +/** + * Cache tile size (width and height). This needs to be a power of two. 
+ */ +#define TILE_SIZE 64 + + + +struct softpipe_cached_tile +{ +   int x, y;           /**< pos of tile in window coords */ +   int z, face, level; /**< Extra texture indexes */ +   union { +      float color[TILE_SIZE][TILE_SIZE][4]; +      uint color32[TILE_SIZE][TILE_SIZE]; +      uint depth32[TILE_SIZE][TILE_SIZE]; +      ushort depth16[TILE_SIZE][TILE_SIZE]; +      ubyte stencil8[TILE_SIZE][TILE_SIZE]; +      ubyte any[1]; +   } data; +}; + + +extern struct softpipe_tile_cache * +sp_create_tile_cache( struct pipe_screen *screen ); + +extern void +sp_destroy_tile_cache(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_set_surface(struct softpipe_tile_cache *tc, +                          struct pipe_surface *sps); + +extern struct pipe_surface * +sp_tile_cache_get_surface(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_set_texture(struct pipe_context *pipe, +                          struct softpipe_tile_cache *tc, +                          struct pipe_texture *texture); + +extern void +sp_flush_tile_cache(struct softpipe_context *softpipe, +                    struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_clear(struct softpipe_tile_cache *tc, uint clearValue); + +extern struct softpipe_cached_tile * +sp_get_cached_tile(struct softpipe_context *softpipe, +                   struct softpipe_tile_cache *tc, int x, int y); + +extern const struct softpipe_cached_tile * +sp_get_cached_tile_tex(struct softpipe_context *softpipe, +                       struct softpipe_tile_cache *tc, int x, int y, int z, +                       int face, int level); + + +#endif /* SP_TILE_CACHE_H */ + diff --git a/src/gallium/drivers/softpipe/sp_winsys.h b/src/gallium/drivers/softpipe/sp_winsys.h new file mode 100644 index 0000000000..4ab666486c --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_winsys.h @@ -0,0 +1,73 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + +/* This is the interface that softpipe requires any window system + * hosting it to implement.  
This is the only include file in softpipe
+ * which is public.
+ */
+
+
+#ifndef SP_WINSYS_H
+#define SP_WINSYS_H
+
+
+#include "pipe/p_compiler.h" /* for boolean */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+enum pipe_format;
+
+struct softpipe_winsys {
+   /** test if the given format is supported for front/back color bufs */
+   boolean (*is_format_supported)( struct softpipe_winsys *sws,
+                                   enum pipe_format format );
+
+};
+
+struct pipe_screen;
+struct pipe_winsys;
+struct pipe_context;
+
+
+struct pipe_context *softpipe_create( struct pipe_screen *,
+                                      struct pipe_winsys *,
+				      void *unused );
+
+
+struct pipe_screen *
+softpipe_create_screen(struct pipe_winsys *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SP_WINSYS_H */
diff --git a/src/gallium/drivers/trace/Makefile b/src/gallium/drivers/trace/Makefile
new file mode 100644
index 0000000000..3859b8acb0
--- /dev/null
+++ b/src/gallium/drivers/trace/Makefile
@@ -0,0 +1,18 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = trace
+
+C_SOURCES = \
+	tr_context.c \
+	tr_dump.c \
+	tr_screen.c \
+	tr_state.c \
+	tr_texture.c \
+	tr_winsys.c
+
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README
new file mode 100644
index 0000000000..f0e1cd596d
--- /dev/null
+++ b/src/gallium/drivers/trace/README
@@ -0,0 +1,64 @@
+                             TRACE PIPE DRIVER
+
+
+= About =
+
+This directory contains a Gallium3D pipe driver which traces all incoming calls.
+
+
+= Build Instructions =
+
+To build, invoke scons in the top directory as
+
+ scons statetrackers=mesa drivers=softpipe,i965simple,trace winsys=xlib
+
+
+= Usage =
+
+To use it, do
+
+ ln -s libGL.so build/linux-x86-debug/gallium/winsys/xlib/libGL.so.1
+ export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/gallium/winsys/xlib
+
+ensure that the right libGL.so is being picked up by doing
+
+ ldd progs/trivial/tri
+
+and then try running
+
+ GALLIUM_TRACE=tri.trace progs/trivial/tri
+
+which should create a tri.trace file, which is an XML file. You can view it by
+copying trace.xsl to the same directory and opening the trace with an
+XSLT-capable browser such as Firefox or Internet Explorer.
+
+
+= Integrating =
+
+You can integrate the trace pipe driver either inside the state tracker or the
+winsys. The procedure in both cases is the same. Let's assume you have a
+pipe_screen and a pipe_context pair obtained by the usual means (variable and
+function names are just for illustration purposes):
+
+  real_screen = real_screen_create(...);
+
+  real_context = real_context_create(...);
+
+The trace screen and trace context are then created by doing
+
+  trace_screen = trace_screen_create(real_screen);
+
+  trace_context = trace_context_create(trace_screen, real_context);
+
+You can then simply use trace_screen and trace_context instead of real_screen
+and real_context.
+
+Do not call trace_winsys_create. Simply pass trace_screen->winsys or
+trace_context->winsys in places where you would pass the winsys.
+
+You can create as many contexts as you wish. Just ensure that you don't mistake
+trace_screen for real_screen when creating them.
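+
+For illustration, here is a minimal C sketch of that wiring, assuming softpipe
+is the real driver underneath, that my_winsys_create() is a hypothetical
+placeholder for whatever winsys object your window system actually provides,
+and that trace_screen_create()/trace_context_create() return the wrapping
+pipe_screen and pipe_context as the snippets above suggest:
+
+  /* minimal sketch, assuming softpipe as the real driver underneath */
+  struct pipe_winsys *winsys = my_winsys_create();        /* hypothetical */
+  struct pipe_screen *real_screen = softpipe_create_screen(winsys);
+  struct pipe_context *real_context = softpipe_create(real_screen, winsys, NULL);
+
+  /* wrap both objects with the trace driver */
+  struct pipe_screen *trace_screen = trace_screen_create(real_screen);
+  struct pipe_context *trace_context = trace_context_create(trace_screen,
+                                                             real_context);
+
+  /* from here on, pass trace_screen and trace_context everywhere */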
+ + +-- +Jose Fonseca <jrfonseca@tungstengraphics.com> diff --git a/src/gallium/drivers/trace/SConscript b/src/gallium/drivers/trace/SConscript new file mode 100644 index 0000000000..0a6bfb8f4c --- /dev/null +++ b/src/gallium/drivers/trace/SConscript @@ -0,0 +1,16 @@ +Import('*') + +env = env.Clone() + +trace = env.ConvenienceLibrary( +    target = 'trace', +    source = [ +        'tr_context.c', +        'tr_dump.c', +        'tr_screen.c', +        'tr_state.c', +        'tr_texture.c', +        'tr_winsys.c', +    ]) + +Export('trace')
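
Every wrapper in tr_context.c below follows the same shape: recover the trace_context from the pipe_context it was handed, dump the call and its arguments, forward to the wrapped driver's context, and close the call record. A condensed sketch of that shape (some_call is a stand-in name, not a real pipe_context method; the comments note the XML each step writes into the trace):

   static INLINE void
   trace_context_some_call(struct pipe_context *_pipe, unsigned arg)
   {
      struct trace_context *tr_ctx = trace_context(_pipe);  /* unwrap */
      struct pipe_context *pipe = tr_ctx->pipe;             /* real driver */

      trace_dump_call_begin("pipe_context", "some_call");   /* <call class='pipe_context' method='some_call'> */
      trace_dump_arg(ptr, pipe);                             /* <arg name='pipe'><ptr>...</ptr></arg> */
      trace_dump_arg(uint, arg);                             /* <arg name='arg'><uint>...</uint></arg> */

      pipe->some_call(pipe, arg);                            /* forward to the wrapped context */

      trace_dump_call_end();                                 /* </call> */
   }

Calls that return a value additionally wrap the result with trace_dump_ret() before closing the record, as the real wrappers below do.
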
\ No newline at end of file diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c new file mode 100644 index 0000000000..ec8be27077 --- /dev/null +++ b/src/gallium/drivers/trace/tr_context.c @@ -0,0 +1,1072 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_screen.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_screen.h" +#include "tr_texture.h" +#include "tr_winsys.h" +#include "tr_context.h" + + +static INLINE struct pipe_texture *  +trace_texture_unwrap(struct trace_context *tr_ctx, +                     struct pipe_texture *texture) +{ +   struct trace_screen *tr_scr = trace_screen(tr_ctx->base.screen);  +   struct trace_texture *tr_tex; +    +   if(!texture) +      return NULL; +    +   tr_tex = trace_texture(tr_scr, texture); +    +   assert(tr_tex->texture); +   assert(tr_tex->texture->screen == tr_scr->screen); +   return tr_tex->texture; +} + + +static INLINE struct pipe_surface *  +trace_surface_unwrap(struct trace_context *tr_ctx, +                     struct pipe_surface *surface) +{ +   struct trace_screen *tr_scr = trace_screen(tr_ctx->base.screen);  +   struct trace_texture *tr_tex; +   struct trace_surface *tr_surf; +    +   if(!surface) +      return NULL; + +   assert(surface->texture); +   if(!surface->texture) +      return surface; +    +   tr_tex = trace_texture(tr_scr, surface->texture); +   tr_surf = trace_surface(tr_tex, surface); +    +   assert(tr_surf->surface); +   assert(tr_surf->surface->texture->screen == tr_scr->screen); +   return tr_surf->surface; +} + + +static INLINE void +trace_context_set_edgeflags(struct pipe_context *_pipe, +                            const unsigned *bitfield) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_edgeflags"); +    +   trace_dump_arg(ptr, pipe); +   /* FIXME: we don't know how big this array is */ +   trace_dump_arg(ptr, bitfield); + +   pipe->set_edgeflags(pipe, bitfield);; + +   trace_dump_call_end(); +} + + +static INLINE boolean +trace_context_draw_arrays(struct pipe_context *_pipe, +            
              unsigned mode, unsigned start, unsigned count) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   boolean result; + +   trace_dump_call_begin("pipe_context", "draw_arrays"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, mode); +   trace_dump_arg(uint, start); +   trace_dump_arg(uint, count); + +   result = pipe->draw_arrays(pipe, mode, start, count);; + +   trace_dump_ret(bool, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE boolean +trace_context_draw_elements(struct pipe_context *_pipe, +                          struct pipe_buffer *indexBuffer, +                          unsigned indexSize, +                          unsigned mode, unsigned start, unsigned count) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   boolean result; + +   trace_winsys_user_buffer_update(_pipe->winsys, indexBuffer); + +   trace_dump_call_begin("pipe_context", "draw_elements"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, indexBuffer); +   trace_dump_arg(uint, indexSize); +   trace_dump_arg(uint, mode); +   trace_dump_arg(uint, start); +   trace_dump_arg(uint, count); + +   result = pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);; + +   trace_dump_ret(bool, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE boolean +trace_context_draw_range_elements(struct pipe_context *_pipe, +                                  struct pipe_buffer *indexBuffer, +                                  unsigned indexSize, +                                  unsigned minIndex, +                                  unsigned maxIndex, +                                  unsigned mode,  +                                  unsigned start,  +                                  unsigned count) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   boolean result; + +   trace_winsys_user_buffer_update(_pipe->winsys, indexBuffer); + +   trace_dump_call_begin("pipe_context", "draw_range_elements"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, indexBuffer); +   trace_dump_arg(uint, indexSize); +   trace_dump_arg(uint, minIndex); +   trace_dump_arg(uint, maxIndex); +   trace_dump_arg(uint, mode); +   trace_dump_arg(uint, start); +   trace_dump_arg(uint, count); + +   result = pipe->draw_range_elements(pipe,  +                                      indexBuffer,  +                                      indexSize, minIndex, maxIndex,  +                                      mode, start, count); +    +   trace_dump_ret(bool, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE struct pipe_query * +trace_context_create_query(struct pipe_context *_pipe, +                           unsigned query_type) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   struct pipe_query *result; + +   trace_dump_call_begin("pipe_context", "create_query"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, query_type); + +   result = pipe->create_query(pipe, query_type);; + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_destroy_query(struct pipe_context *_pipe, +                            struct pipe_query *query) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct 
pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "destroy_query"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, query); + +   pipe->destroy_query(pipe, query);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_begin_query(struct pipe_context *_pipe,  +                          struct pipe_query *query) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "begin_query"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, query); + +   pipe->begin_query(pipe, query);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_end_query(struct pipe_context *_pipe,  +                        struct pipe_query *query) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "end_query"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, query); + +   pipe->end_query(pipe, query); + +   trace_dump_call_end(); +} + + +static INLINE boolean +trace_context_get_query_result(struct pipe_context *_pipe,  +                               struct pipe_query *query, +                               boolean wait, +                               uint64_t *presult) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   uint64_t result; +   boolean _result; + +   trace_dump_call_begin("pipe_context", "get_query_result"); + +   trace_dump_arg(ptr, pipe); + +   _result = pipe->get_query_result(pipe, query, wait, presult);; +   result = *presult; + +   trace_dump_arg(uint, result); +   trace_dump_ret(bool, _result); +    +   trace_dump_call_end(); +    +   return _result; +} + + +static INLINE void * +trace_context_create_blend_state(struct pipe_context *_pipe, +                                 const struct pipe_blend_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * result; + +   trace_dump_call_begin("pipe_context", "create_blend_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(blend_state, state); + +   result = pipe->create_blend_state(pipe, state);; + +   trace_dump_ret(ptr, result); + +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_bind_blend_state(struct pipe_context *_pipe,  +                               void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_blend_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->bind_blend_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_blend_state(struct pipe_context *_pipe,  +                                 void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_blend_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_blend_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_sampler_state(struct pipe_context *_pipe, +                                   const struct pipe_sampler_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * 
result; + +   trace_dump_call_begin("pipe_context", "create_sampler_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(sampler_state, state); + +   result = pipe->create_sampler_state(pipe, state);; + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_bind_sampler_states(struct pipe_context *_pipe,  +                                  unsigned num_states, void **states) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_sampler_states"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, num_states); +   trace_dump_arg_array(ptr, states, num_states); + +   pipe->bind_sampler_states(pipe, num_states, states);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_sampler_state(struct pipe_context *_pipe,  +                                   void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_sampler_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_sampler_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_rasterizer_state(struct pipe_context *_pipe, +                                      const struct pipe_rasterizer_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * result; + +   trace_dump_call_begin("pipe_context", "create_rasterizer_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(rasterizer_state, state); + +   result = pipe->create_rasterizer_state(pipe, state);; + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_bind_rasterizer_state(struct pipe_context *_pipe,  +                                    void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_rasterizer_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->bind_rasterizer_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_rasterizer_state(struct pipe_context *_pipe,  +                                      void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_rasterizer_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_rasterizer_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe, +                                               const struct pipe_depth_stencil_alpha_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * result; + +   trace_dump_call_begin("pipe_context", "create_depth_stencil_alpha_state"); + +   result = pipe->create_depth_stencil_alpha_state(pipe, state);; + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(depth_stencil_alpha_state, state); +    +   trace_dump_ret(ptr, result); + +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void 
+trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,  +                                             void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_depth_stencil_alpha_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->bind_depth_stencil_alpha_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,  +                                               void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_depth_stencil_alpha_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_depth_stencil_alpha_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_fs_state(struct pipe_context *_pipe, +                              const struct pipe_shader_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * result; + +   trace_dump_call_begin("pipe_context", "create_fs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(shader_state, state); + +   result = pipe->create_fs_state(pipe, state);; + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_bind_fs_state(struct pipe_context *_pipe,  +                            void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_fs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->bind_fs_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_fs_state(struct pipe_context *_pipe,  +                              void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_fs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_fs_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_vs_state(struct pipe_context *_pipe, +                              const struct pipe_shader_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   void * result; + +   trace_dump_call_begin("pipe_context", "create_vs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(shader_state, state); + +   result = pipe->create_vs_state(pipe, state);; + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static INLINE void +trace_context_bind_vs_state(struct pipe_context *_pipe,  +                            void *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "bind_vs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->bind_vs_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_vs_state(struct pipe_context *_pipe,  +                              void *state) +{ +   struct 
trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "delete_vs_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, state); + +   pipe->delete_vs_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_blend_color(struct pipe_context *_pipe, +                              const struct pipe_blend_color *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_blend_color"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(blend_color, state); + +   pipe->set_blend_color(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_clip_state(struct pipe_context *_pipe, +                             const struct pipe_clip_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_clip_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(clip_state, state); + +   pipe->set_clip_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_constant_buffer(struct pipe_context *_pipe, +                                  uint shader, uint index, +                                  const struct pipe_constant_buffer *buffer) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_winsys_user_buffer_update(_pipe->winsys, (struct pipe_buffer *)buffer); +    +   trace_dump_call_begin("pipe_context", "set_constant_buffer"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, shader); +   trace_dump_arg(uint, index); +   trace_dump_arg(constant_buffer, buffer); + +   pipe->set_constant_buffer(pipe, shader, index, buffer);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_framebuffer_state(struct pipe_context *_pipe, +                                    const struct pipe_framebuffer_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   struct pipe_framebuffer_state unwrapped_state; +   unsigned i; +    +   /* Unwrap the input state */ +   memcpy(&unwrapped_state, state, sizeof(unwrapped_state)); +   for(i = 0; i < state->nr_cbufs; ++i) +      unwrapped_state.cbufs[i] = trace_surface_unwrap(tr_ctx, state->cbufs[i]); +   for(i = state->nr_cbufs; i < PIPE_MAX_COLOR_BUFS; ++i) +      unwrapped_state.cbufs[i] = NULL; +   unwrapped_state.zsbuf = trace_surface_unwrap(tr_ctx, state->zsbuf); +   state = &unwrapped_state; +    +   trace_dump_call_begin("pipe_context", "set_framebuffer_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(framebuffer_state, state); + +   pipe->set_framebuffer_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_polygon_stipple(struct pipe_context *_pipe, +                                  const struct pipe_poly_stipple *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_polygon_stipple"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(poly_stipple, state); + +   pipe->set_polygon_stipple(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_scissor_state(struct pipe_context *_pipe, +                          
      const struct pipe_scissor_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_scissor_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(scissor_state, state); + +   pipe->set_scissor_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_viewport_state(struct pipe_context *_pipe, +                                 const struct pipe_viewport_state *state) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_viewport_state"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(viewport_state, state); + +   pipe->set_viewport_state(pipe, state);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_sampler_textures(struct pipe_context *_pipe, +                                   unsigned num_textures, +                                   struct pipe_texture **textures) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   struct pipe_texture *unwrapped_textures[PIPE_MAX_SAMPLERS]; +   unsigned i; +    +   for(i = 0; i < num_textures; ++i) +      unwrapped_textures[i] = trace_texture_unwrap(tr_ctx, textures[i]); +   textures = unwrapped_textures; + +   trace_dump_call_begin("pipe_context", "set_sampler_textures"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, num_textures); +   trace_dump_arg_array(ptr, textures, num_textures); + +   pipe->set_sampler_textures(pipe, num_textures, textures);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_vertex_buffers(struct pipe_context *_pipe, +                                 unsigned num_buffers, +                                 const struct pipe_vertex_buffer *buffers) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; +   unsigned i; + +   for(i = 0; i < num_buffers; ++i) +      trace_winsys_user_buffer_update(_pipe->winsys, buffers[i].buffer); + +   trace_dump_call_begin("pipe_context", "set_vertex_buffers"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, num_buffers); +    +   trace_dump_arg_begin("buffers"); +   trace_dump_struct_array(vertex_buffer, buffers, num_buffers); +   trace_dump_arg_end(); + +   pipe->set_vertex_buffers(pipe, num_buffers, buffers);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_vertex_elements(struct pipe_context *_pipe, +                                  unsigned num_elements, +                                  const struct pipe_vertex_element *elements) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "set_vertex_elements"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, num_elements); + +   trace_dump_arg_begin("elements"); +   trace_dump_struct_array(vertex_element, elements, num_elements); +   trace_dump_arg_end(); + +   pipe->set_vertex_elements(pipe, num_elements, elements);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_surface_copy(struct pipe_context *_pipe, +                           boolean do_flip, +                           struct pipe_surface *dest, +                           unsigned destx, unsigned desty, +                           struct pipe_surface *src, +                           
unsigned srcx, unsigned srcy, +                           unsigned width, unsigned height) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   dest = trace_surface_unwrap(tr_ctx, dest); +   src = trace_surface_unwrap(tr_ctx, src); +    +   trace_dump_call_begin("pipe_context", "surface_copy"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(bool, do_flip); +   trace_dump_arg(ptr, dest); +   trace_dump_arg(uint, destx); +   trace_dump_arg(uint, desty); +   trace_dump_arg(ptr, src); +   trace_dump_arg(uint, srcx); +   trace_dump_arg(uint, srcy); +   trace_dump_arg(uint, width); +   trace_dump_arg(uint, height); + +   pipe->surface_copy(pipe, do_flip,  +                      dest, destx, desty,  +                      src, srcx, srcy, width, height); +    +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_surface_fill(struct pipe_context *_pipe, +                           struct pipe_surface *dst, +                           unsigned dstx, unsigned dsty, +                           unsigned width, unsigned height, +                           unsigned value) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   dst = trace_surface_unwrap(tr_ctx, dst); + +   trace_dump_call_begin("pipe_context", "surface_fill"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, dst); +   trace_dump_arg(uint, dstx); +   trace_dump_arg(uint, dsty); +   trace_dump_arg(uint, width); +   trace_dump_arg(uint, height); + +   pipe->surface_fill(pipe, dst, dstx, dsty, width, height, value);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_clear(struct pipe_context *_pipe,  +                    struct pipe_surface *surface, +                    unsigned clearValue) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   surface = trace_surface_unwrap(tr_ctx, surface); + +   trace_dump_call_begin("pipe_context", "clear"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(ptr, surface); +   trace_dump_arg(uint, clearValue); + +   pipe->clear(pipe, surface, clearValue);; + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_flush(struct pipe_context *_pipe, +                    unsigned flags, +                    struct pipe_fence_handle **fence) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "flush"); + +   trace_dump_arg(ptr, pipe); +   trace_dump_arg(uint, flags); + +   pipe->flush(pipe, flags, fence);; + +   if(fence) +      trace_dump_ret(ptr, *fence); + +   trace_dump_call_end(); +} + + +static INLINE void +trace_context_destroy(struct pipe_context *_pipe) +{ +   struct trace_context *tr_ctx = trace_context(_pipe); +   struct pipe_context *pipe = tr_ctx->pipe; + +   trace_dump_call_begin("pipe_context", "destroy"); + +   trace_dump_arg(ptr, pipe); + +   pipe->destroy(pipe); +    +   trace_dump_call_end(); + +   FREE(tr_ctx); +} + + +struct pipe_context * +trace_context_create(struct pipe_screen *screen,  +                     struct pipe_context *pipe) +{ +   struct trace_context *tr_ctx; +    +   if(!pipe) +      goto error1; +    +   if(!trace_dump_enabled()) +      goto error1; +    +   tr_ctx = CALLOC_STRUCT(trace_context); +   if(!tr_ctx) +      goto error1; + +   tr_ctx->base.winsys = screen->winsys; +   tr_ctx->base.screen = screen; +   tr_ctx->base.destroy = 
trace_context_destroy; +   tr_ctx->base.set_edgeflags = trace_context_set_edgeflags; +   tr_ctx->base.draw_arrays = trace_context_draw_arrays; +   tr_ctx->base.draw_elements = trace_context_draw_elements; +   tr_ctx->base.draw_range_elements = trace_context_draw_range_elements; +   tr_ctx->base.create_query = trace_context_create_query; +   tr_ctx->base.destroy_query = trace_context_destroy_query; +   tr_ctx->base.begin_query = trace_context_begin_query; +   tr_ctx->base.end_query = trace_context_end_query; +   tr_ctx->base.get_query_result = trace_context_get_query_result; +   tr_ctx->base.create_blend_state = trace_context_create_blend_state; +   tr_ctx->base.bind_blend_state = trace_context_bind_blend_state; +   tr_ctx->base.delete_blend_state = trace_context_delete_blend_state; +   tr_ctx->base.create_sampler_state = trace_context_create_sampler_state; +   tr_ctx->base.bind_sampler_states = trace_context_bind_sampler_states; +   tr_ctx->base.delete_sampler_state = trace_context_delete_sampler_state; +   tr_ctx->base.create_rasterizer_state = trace_context_create_rasterizer_state; +   tr_ctx->base.bind_rasterizer_state = trace_context_bind_rasterizer_state; +   tr_ctx->base.delete_rasterizer_state = trace_context_delete_rasterizer_state; +   tr_ctx->base.create_depth_stencil_alpha_state = trace_context_create_depth_stencil_alpha_state; +   tr_ctx->base.bind_depth_stencil_alpha_state = trace_context_bind_depth_stencil_alpha_state; +   tr_ctx->base.delete_depth_stencil_alpha_state = trace_context_delete_depth_stencil_alpha_state; +   tr_ctx->base.create_fs_state = trace_context_create_fs_state; +   tr_ctx->base.bind_fs_state = trace_context_bind_fs_state; +   tr_ctx->base.delete_fs_state = trace_context_delete_fs_state; +   tr_ctx->base.create_vs_state = trace_context_create_vs_state; +   tr_ctx->base.bind_vs_state = trace_context_bind_vs_state; +   tr_ctx->base.delete_vs_state = trace_context_delete_vs_state; +   tr_ctx->base.set_blend_color = trace_context_set_blend_color; +   tr_ctx->base.set_clip_state = trace_context_set_clip_state; +   tr_ctx->base.set_constant_buffer = trace_context_set_constant_buffer; +   tr_ctx->base.set_framebuffer_state = trace_context_set_framebuffer_state; +   tr_ctx->base.set_polygon_stipple = trace_context_set_polygon_stipple; +   tr_ctx->base.set_scissor_state = trace_context_set_scissor_state; +   tr_ctx->base.set_viewport_state = trace_context_set_viewport_state; +   tr_ctx->base.set_sampler_textures = trace_context_set_sampler_textures; +   tr_ctx->base.set_vertex_buffers = trace_context_set_vertex_buffers; +   tr_ctx->base.set_vertex_elements = trace_context_set_vertex_elements; +   tr_ctx->base.surface_copy = trace_context_surface_copy; +   tr_ctx->base.surface_fill = trace_context_surface_fill; +   tr_ctx->base.clear = trace_context_clear; +   tr_ctx->base.flush = trace_context_flush; + +   tr_ctx->pipe = pipe; +    +   trace_dump_call_begin("", "pipe_context_create"); +   trace_dump_arg_begin("screen"); +   trace_dump_ptr(pipe->screen); +   trace_dump_arg_end(); +   trace_dump_ret(ptr, pipe); +   trace_dump_call_end(); + +   return &tr_ctx->base; +    +error1: +   return pipe; +} diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h new file mode 100644 index 0000000000..7831900ec2 --- /dev/null +++ b/src/gallium/drivers/trace/tr_context.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_CONTEXT_H_ +#define TR_CONTEXT_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_context.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +    +struct trace_context +{ +   struct pipe_context base; +    +   struct pipe_context *pipe; +}; + + +static INLINE struct trace_context * +trace_context(struct pipe_context *pipe) +{ +   assert(pipe); +   return (struct trace_context *)pipe; +} + + + +struct pipe_context * +trace_context_create(struct pipe_screen *screen, +                     struct pipe_context *pipe); + + +#ifdef __cplusplus +} +#endif + +#endif /* TR_CONTEXT_H_ */ diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c new file mode 100644 index 0000000000..a0ead0ded3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_dump.c @@ -0,0 +1,404 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Trace dumping functions. 
+ *  + * For now we just use standard XML for dumping the trace calls, as this is + * simple to write, parse, and visually inspect, but the actual representation  + * is abstracted out of this file, so that we can switch to a binary  + * representation if/when it becomes justified. + *  + * @author Jose Fonseca <jrfonseca@tungstengraphics.com>    + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) +#include <stdlib.h> +#endif + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "util/u_memory.h" +#include "util/u_string.h" +#include "util/u_stream.h" + +#include "tr_dump.h" + + +static struct util_stream *stream = NULL; +static unsigned refcount = 0; + + +static INLINE void  +trace_dump_write(const char *buf, size_t size) +{ +   if(stream) +      util_stream_write(stream, buf, size); +} + + +static INLINE void  +trace_dump_writes(const char *s) +{ +   trace_dump_write(s, strlen(s)); +} + + +static INLINE void  +trace_dump_writef(const char *format, ...) +{ +   static char buf[1024]; +   unsigned len; +   va_list ap; +   va_start(ap, format); +   len = util_vsnprintf(buf, sizeof(buf), format, ap); +   va_end(ap); +   trace_dump_write(buf, len); +} + + +static INLINE void  +trace_dump_escape(const char *str)  +{ +   const unsigned char *p = (const unsigned char *)str; +   unsigned char c; +   while((c = *p++) != 0) { +      if(c == '<') +         trace_dump_writes("<"); +      else if(c == '>') +         trace_dump_writes(">"); +      else if(c == '&') +         trace_dump_writes("&"); +      else if(c == '\'') +         trace_dump_writes("'"); +      else if(c == '\"') +         trace_dump_writes("""); +      else if(c >= 0x20 && c <= 0x7e) +         trace_dump_writef("%c", c); +      else +         trace_dump_writef("&#%u;", c); +   } +} + + +static INLINE void  +trace_dump_indent(unsigned level) +{ +   unsigned i; +   for(i = 0; i < level; ++i) +      trace_dump_writes("\t"); +} + + +static INLINE void  +trace_dump_newline(void)  +{ +   trace_dump_writes("\n"); +} + + +static INLINE void  +trace_dump_tag(const char *name) +{ +   trace_dump_writes("<"); +   trace_dump_writes(name); +   trace_dump_writes("/>"); +} + + +static INLINE void  +trace_dump_tag_begin(const char *name) +{ +   trace_dump_writes("<"); +   trace_dump_writes(name); +   trace_dump_writes(">"); +} + +static INLINE void  +trace_dump_tag_begin1(const char *name,  +                      const char *attr1, const char *value1) +{ +   trace_dump_writes("<"); +   trace_dump_writes(name); +   trace_dump_writes(" "); +   trace_dump_writes(attr1); +   trace_dump_writes("='"); +   trace_dump_escape(value1); +   trace_dump_writes("'>"); +} + + +static INLINE void  +trace_dump_tag_begin2(const char *name,  +                      const char *attr1, const char *value1, +                      const char *attr2, const char *value2) +{ +   trace_dump_writes("<"); +   trace_dump_writes(name); +   trace_dump_writes(" "); +   trace_dump_writes(attr1); +   trace_dump_writes("=\'"); +   trace_dump_escape(value1); +   trace_dump_writes("\' "); +   trace_dump_writes(attr2); +   trace_dump_writes("=\'"); +   trace_dump_escape(value2); +   trace_dump_writes("\'>"); +} + + +static INLINE void  +trace_dump_tag_begin3(const char *name,  +                      const char *attr1, const char *value1, +                      const char *attr2, const char *value2, +                      const char *attr3, const char *value3) +{ +   trace_dump_writes("<"); +   trace_dump_writes(name); +   trace_dump_writes(" "); +   
trace_dump_writes(attr1); +   trace_dump_writes("=\'"); +   trace_dump_escape(value1); +   trace_dump_writes("\' "); +   trace_dump_writes(attr2); +   trace_dump_writes("=\'"); +   trace_dump_escape(value2); +   trace_dump_writes("\' "); +   trace_dump_writes(attr3); +   trace_dump_writes("=\'"); +   trace_dump_escape(value3); +   trace_dump_writes("\'>"); +} + + +static INLINE void +trace_dump_tag_end(const char *name) +{ +   trace_dump_writes("</"); +   trace_dump_writes(name); +   trace_dump_writes(">"); +} + +static void  +trace_dump_trace_close(void) +{ +   if(stream) { +      trace_dump_writes("</trace>\n"); +      util_stream_close(stream); +      stream = NULL; +      refcount = 0; +   } +} + +boolean trace_dump_trace_begin() +{ +   const char *filename; +    +   filename = debug_get_option("GALLIUM_TRACE", NULL); +   if(!filename) +      return FALSE; +    +   if(!stream) { +    +      stream = util_stream_create(filename, 0); +      if(!stream) +         return FALSE; +       +      trace_dump_writes("<?xml version='1.0' encoding='UTF-8'?>\n"); +      trace_dump_writes("<?xml-stylesheet type='text/xsl' href='trace.xsl'?>\n"); +      trace_dump_writes("<trace version='0.1'>\n"); +       +#if defined(PIPE_OS_LINUX) +      /* Linux applications rarely cleanup GL / Gallium resources so catch  +       * application exit here */  +      atexit(trace_dump_trace_close); +#endif +   } +    +   ++refcount; +    +   return TRUE; +} + +boolean trace_dump_enabled(void) +{ +   return stream ? TRUE : FALSE; +} + +void trace_dump_trace_end(void) +{ +   if(stream) +      if(!--refcount) +         trace_dump_trace_close(); +} + +void trace_dump_call_begin(const char *klass, const char *method) +{ +   trace_dump_indent(1); +   trace_dump_tag_begin2("call", "class", klass, "method", method); +   trace_dump_newline(); +} + +void trace_dump_call_end(void) +{ +   trace_dump_indent(1); +   trace_dump_tag_end("call"); +   trace_dump_newline(); +   util_stream_flush(stream); +} + +void trace_dump_arg_begin(const char *name) +{ +   trace_dump_indent(2); +   trace_dump_tag_begin1("arg", "name", name); +} + +void trace_dump_arg_end(void) +{ +   trace_dump_tag_end("arg"); +   trace_dump_newline(); +} + +void trace_dump_ret_begin(void) +{ +   trace_dump_indent(2); +   trace_dump_tag_begin("ret"); +} + +void trace_dump_ret_end(void) +{ +   trace_dump_tag_end("ret"); +   trace_dump_newline(); +} + +void trace_dump_bool(int value) +{ +   trace_dump_writef("<bool>%c</bool>", value ? 
'1' : '0'); +} + +void trace_dump_int(long long int value) +{ +   trace_dump_writef("<int>%lli</int>", value); +} + +void trace_dump_uint(long long unsigned value) +{ +   trace_dump_writef("<uint>%llu</uint>", value); +} + +void trace_dump_float(double value) +{ +   trace_dump_writef("<float>%g</float>", value); +} + +void trace_dump_bytes(const void *data, +                      long unsigned size) +{ +   static const char hex_table[16] = "0123456789ABCDEF"; +   const uint8_t *p = data; +   long unsigned i; +   trace_dump_writes("<bytes>"); +   for(i = 0; i < size; ++i) { +      uint8_t byte = *p++; +      char hex[2]; +      hex[0] = hex_table[byte >> 4]; +      hex[1] = hex_table[byte & 0xf]; +      trace_dump_write(hex, 2); +   } +   trace_dump_writes("</bytes>"); +} + +void trace_dump_string(const char *str) +{ +   trace_dump_writes("<string>"); +   trace_dump_escape(str); +   trace_dump_writes("</string>"); +} + +void trace_dump_enum(const char *value) +{ +   trace_dump_writes("<enum>"); +   trace_dump_escape(value); +   trace_dump_writes("</enum>"); +} + +void trace_dump_array_begin(void) +{ +   trace_dump_writes("<array>"); +} + +void trace_dump_array_end(void) +{ +   trace_dump_writes("</array>"); +} + +void trace_dump_elem_begin(void) +{ +   trace_dump_writes("<elem>"); +} + +void trace_dump_elem_end(void) +{ +   trace_dump_writes("</elem>"); +} + +void trace_dump_struct_begin(const char *name) +{ +   trace_dump_writef("<struct name='%s'>", name); +} + +void trace_dump_struct_end(void) +{ +   trace_dump_writes("</struct>"); +} + +void trace_dump_member_begin(const char *name) +{ +   trace_dump_writef("<member name='%s'>", name); +} + +void trace_dump_member_end(void) +{ +   trace_dump_writes("</member>"); +} + +void trace_dump_null(void) +{ +   trace_dump_writes("<null/>"); +} + +void trace_dump_ptr(const void *value) +{ +   if(value) +      trace_dump_writef("<ptr>0x%08lx</ptr>", (unsigned long)(uintptr_t)value); +   else +      trace_dump_null(); +} diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h new file mode 100644 index 0000000000..76a53731b3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_dump.h @@ -0,0 +1,132 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * @file + * Trace data dumping primitives. + */ + +#ifndef TR_DUMP_H +#define TR_DUMP_H + + +#include "pipe/p_compiler.h" + + +boolean trace_dump_trace_begin(void); +boolean trace_dump_enabled(void); +void trace_dump_trace_end(void); +void trace_dump_call_begin(const char *klass, const char *method); +void trace_dump_call_end(void); +void trace_dump_arg_begin(const char *name); +void trace_dump_arg_end(void); +void trace_dump_ret_begin(void); +void trace_dump_ret_end(void); +void trace_dump_bool(int value); +void trace_dump_int(long long int value); +void trace_dump_uint(long long unsigned value); +void trace_dump_float(double value); +void trace_dump_bytes(const void *data, long unsigned size); +void trace_dump_string(const char *str); +void trace_dump_enum(const char *value); +void trace_dump_array_begin(void); +void trace_dump_array_end(void); +void trace_dump_elem_begin(void); +void trace_dump_elem_end(void); +void trace_dump_struct_begin(const char *name); +void trace_dump_struct_end(void); +void trace_dump_member_begin(const char *name); +void trace_dump_member_end(void); +void trace_dump_null(void); +void trace_dump_ptr(const void *value); + + +/* + * Code saving macros.  + */ + +#define trace_dump_arg(_type, _arg) \ +   do { \ +      trace_dump_arg_begin(#_arg); \ +      trace_dump_##_type(_arg); \ +      trace_dump_arg_end(); \ +   } while(0) + +#define trace_dump_ret(_type, _arg) \ +   do { \ +      trace_dump_ret_begin(); \ +      trace_dump_##_type(_arg); \ +      trace_dump_ret_end(); \ +   } while(0) + +#define trace_dump_array(_type, _obj, _size) \ +   do { \ +      unsigned long idx; \ +      trace_dump_array_begin(); \ +      for(idx = 0; idx < (_size); ++idx) { \ +         trace_dump_elem_begin(); \ +         trace_dump_##_type((_obj)[idx]); \ +         trace_dump_elem_end(); \ +      } \ +      trace_dump_array_end(); \ +   } while(0) + +#define trace_dump_struct_array(_type, _obj, _size) \ +   do { \ +      unsigned long idx; \ +      trace_dump_array_begin(); \ +      for(idx = 0; idx < (_size); ++idx) { \ +         trace_dump_elem_begin(); \ +         trace_dump_##_type(&(_obj)[idx]); \ +         trace_dump_elem_end(); \ +      } \ +      trace_dump_array_end(); \ +   } while(0) + +#define trace_dump_member(_type, _obj, _member) \ +   do { \ +      trace_dump_member_begin(#_member); \ +      trace_dump_##_type((_obj)->_member); \ +      trace_dump_member_end(); \ +   } while(0) + +#define trace_dump_arg_array(_type, _arg, _size) \ +   do { \ +      trace_dump_arg_begin(#_arg); \ +      trace_dump_array(_type, _arg, _size); \ +      trace_dump_arg_end(); \ +   } while(0) + +#define trace_dump_member_array(_type, _obj, _member) \ +   do { \ +      trace_dump_member_begin(#_member); \ +      trace_dump_array(_type, (_obj)->_member, sizeof((_obj)->_member)/sizeof((_obj)->_member[0])); \ +      trace_dump_member_end(); \ +   } while(0) + + +#endif /* TR_DUMP_H */ diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c new file mode 100644 index 0000000000..8789f86b1a --- /dev/null +++ b/src/gallium/drivers/trace/tr_screen.c @@ -0,0 +1,469 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
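
The trace_dump_arg macro in tr_dump.h above is purely textual; trace_dump_arg(uint, mode), for example, expands to the fragment below, which, given the tag writers in tr_dump.c, emits something like <arg name='mode'><uint>...</uint></arg> into the trace file:

   do {
      trace_dump_arg_begin("mode");   /* <arg name='mode'> */
      trace_dump_uint(mode);          /* <uint>...</uint>  */
      trace_dump_arg_end();           /* </arg>            */
   } while(0)
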
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_winsys.h" +#include "tr_texture.h" +#include "tr_screen.h" + + +static const char * +trace_screen_get_name(struct pipe_screen *_screen) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   const char *result; +    +   trace_dump_call_begin("pipe_screen", "get_name"); +    +   trace_dump_arg(ptr, screen); + +   result = screen->get_name(screen); +    +   trace_dump_ret(string, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static const char * +trace_screen_get_vendor(struct pipe_screen *_screen) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   const char *result; +    +   trace_dump_call_begin("pipe_screen", "get_vendor"); +    +   trace_dump_arg(ptr, screen); +   +   result = screen->get_vendor(screen); +    +   trace_dump_ret(string, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static int  +trace_screen_get_param(struct pipe_screen *_screen,  +                       int param) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   int result; +    +   trace_dump_call_begin("pipe_screen", "get_param"); +    +   trace_dump_arg(ptr, screen); +   trace_dump_arg(int, param); + +   result = screen->get_param(screen, param); +    +   trace_dump_ret(int, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static float  +trace_screen_get_paramf(struct pipe_screen *_screen,  +                        int param) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   float result; +    +   trace_dump_call_begin("pipe_screen", "get_paramf"); +    +   trace_dump_arg(ptr, screen); +   trace_dump_arg(int, param); + +   result = screen->get_paramf(screen, param); +    +   trace_dump_ret(float, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static boolean  +trace_screen_is_format_supported(struct pipe_screen *_screen, +                                 enum pipe_format format, +                                 enum pipe_texture_target target, +                                 unsigned 
tex_usage,  +                                 unsigned geom_flags) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   boolean result; +    +   trace_dump_call_begin("pipe_screen", "is_format_supported"); +    +   trace_dump_arg(ptr, screen); +   trace_dump_arg(format, format); +   trace_dump_arg(int, target); +   trace_dump_arg(uint, tex_usage); +   trace_dump_arg(uint, geom_flags); + +   result = screen->is_format_supported(screen, format, target, tex_usage, geom_flags); +    +   trace_dump_ret(bool, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static struct pipe_texture * +trace_screen_texture_create(struct pipe_screen *_screen, +                            const struct pipe_texture *templat) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct pipe_texture *result; +    +   trace_dump_call_begin("pipe_screen", "texture_create"); + +   trace_dump_arg(ptr, screen); +   trace_dump_arg(template, templat); + +   result = screen->texture_create(screen, templat); +    +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   result = trace_texture_create(tr_scr, result); +    +   return result; +} + + +static struct pipe_texture * +trace_screen_texture_blanket(struct pipe_screen *_screen, +                             const struct pipe_texture *templat, +                             const unsigned *ppitch, +                             struct pipe_buffer *buffer) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   unsigned pitch = *ppitch; +   struct pipe_texture *result; + +   trace_dump_call_begin("pipe_screen", "texture_blanket"); + +   trace_dump_arg(ptr, screen); +   trace_dump_arg(template, templat); +   trace_dump_arg(uint, pitch); +   trace_dump_arg(ptr, buffer); + +   result = screen->texture_blanket(screen, templat, ppitch, buffer); +    +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   result = trace_texture_create(tr_scr, result); +    +   return result; +} + + +static void  +trace_screen_texture_release(struct pipe_screen *_screen, +                             struct pipe_texture **ptexture) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct trace_texture *tr_tex; +   struct pipe_texture *texture; +    +   assert(ptexture); +   if(*ptexture) { +      tr_tex = trace_texture(tr_scr, *ptexture); +      texture = tr_tex->texture; +      assert(texture->screen == screen); +   } +   else +      texture = NULL; +    +   if (*ptexture) { +      if (!--(*ptexture)->refcount) { +         trace_dump_call_begin("pipe_screen", "texture_destroy"); +          +         trace_dump_arg(ptr, screen); +         trace_dump_arg(ptr, texture); +          +         trace_texture_destroy(tr_scr, *ptexture); +          +         trace_dump_call_end(); +      } +    +      *ptexture = NULL; +   } +} + + +static struct pipe_surface * +trace_screen_get_tex_surface(struct pipe_screen *_screen, +                             struct pipe_texture *texture, +                             unsigned face, unsigned level, +                             unsigned zslice, +                             unsigned usage) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct trace_texture *tr_tex; +   struct pipe_surface *result; +    +   
assert(texture); +   tr_tex = trace_texture(tr_scr, texture); +   texture = tr_tex->texture; +   assert(texture->screen == screen); +    +   trace_dump_call_begin("pipe_screen", "get_tex_surface"); +    +   trace_dump_arg(ptr, screen); +   trace_dump_arg(ptr, texture); +   trace_dump_arg(uint, face); +   trace_dump_arg(uint, level); +   trace_dump_arg(uint, zslice); +   trace_dump_arg(uint, usage); + +   result = screen->get_tex_surface(screen, texture, face, level, zslice, usage); + +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   result = trace_surface_create(tr_tex, result); + +   return result; +} + + +static void  +trace_screen_tex_surface_release(struct pipe_screen *_screen, +                                 struct pipe_surface **psurface) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct trace_texture *tr_tex; +   struct trace_surface *tr_surf; +   struct pipe_surface *surface; +    +   assert(psurface); +   if(*psurface) { +      tr_tex = trace_texture(tr_scr, (*psurface)->texture); +      tr_surf = trace_surface(tr_tex, *psurface); +      surface = tr_surf->surface; +   } +   else +      surface = NULL; +    +   if (*psurface) { +      if (!--(*psurface)->refcount) { +         trace_dump_call_begin("pipe_screen", "tex_surface_destroy"); +          +         trace_dump_arg(ptr, screen); +         trace_dump_arg(ptr, surface); + +         trace_surface_destroy(tr_tex, *psurface); + +         trace_dump_call_end(); +      } +    +      *psurface = NULL; +   } +} + + +static void * +trace_screen_surface_map(struct pipe_screen *_screen, +                         struct pipe_surface *surface, +                         unsigned flags) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct trace_texture *tr_tex; +   struct trace_surface *tr_surf; +   void *map; +    +   tr_tex = trace_texture(tr_scr, surface->texture); +   tr_surf = trace_surface(tr_tex, surface); +   surface = tr_surf->surface; + +   map = screen->surface_map(screen, surface, flags); +   if(map) { +      if(flags & PIPE_BUFFER_USAGE_CPU_WRITE) { +         assert(!tr_surf->map); +         tr_surf->map = map; +      } +   } +    +   return map; +} + + +static void  +trace_screen_surface_unmap(struct pipe_screen *_screen, +                           struct pipe_surface *surface) +{ +   struct trace_screen *tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +   struct trace_texture *tr_tex; +   struct trace_surface *tr_surf; +    +   tr_tex = trace_texture(tr_scr, surface->texture); +   tr_surf = trace_surface(tr_tex, surface); +   surface = tr_surf->surface; +    +   if(tr_surf->map) { +      size_t size = surface->nblocksy * surface->stride; +       +      trace_dump_call_begin("pipe_winsys", "surface_write"); +       +      trace_dump_arg(ptr, screen); +       +      trace_dump_arg(ptr, surface); +       +      trace_dump_arg_begin("data"); +      trace_dump_bytes(tr_surf->map, size); +      trace_dump_arg_end(); + +      trace_dump_arg_begin("stride"); +      trace_dump_uint(surface->stride); +      trace_dump_arg_end(); + +      trace_dump_arg_begin("size"); +      trace_dump_uint(size); +      trace_dump_arg_end(); +    +      trace_dump_call_end(); + +      tr_surf->map = NULL; +   } + +   screen->surface_unmap(screen, surface); +} + + +static void +trace_screen_destroy(struct pipe_screen *_screen) +{ +   struct trace_screen 
*tr_scr = trace_screen(_screen); +   struct pipe_screen *screen = tr_scr->screen; +    +   trace_dump_call_begin("pipe_screen", "destroy"); +    +   trace_dump_arg(ptr, screen); + +   screen->destroy(screen); +    +   trace_dump_call_end(); + +   trace_dump_trace_end(); + +   FREE(tr_scr); +} + + +struct pipe_screen * +trace_screen_create(struct pipe_screen *screen) +{ +   struct trace_screen *tr_scr; +   struct pipe_winsys *winsys; +    +   if(!screen) +      goto error1; + +   if(!trace_dump_trace_begin()) +      goto error1; + +   tr_scr = CALLOC_STRUCT(trace_screen); +   if(!tr_scr) +      goto error2; + +   winsys = trace_winsys_create(screen->winsys); +   if(!winsys) +      goto error3; +    +   tr_scr->base.winsys = winsys; +   tr_scr->base.destroy = trace_screen_destroy; +   tr_scr->base.get_name = trace_screen_get_name; +   tr_scr->base.get_vendor = trace_screen_get_vendor; +   tr_scr->base.get_param = trace_screen_get_param; +   tr_scr->base.get_paramf = trace_screen_get_paramf; +   tr_scr->base.is_format_supported = trace_screen_is_format_supported; +   tr_scr->base.texture_create = trace_screen_texture_create; +   tr_scr->base.texture_blanket = trace_screen_texture_blanket; +   tr_scr->base.texture_release = trace_screen_texture_release; +   tr_scr->base.get_tex_surface = trace_screen_get_tex_surface; +   tr_scr->base.tex_surface_release = trace_screen_tex_surface_release; +   tr_scr->base.surface_map = trace_screen_surface_map; +   tr_scr->base.surface_unmap = trace_screen_surface_unmap; +    +   tr_scr->screen = screen; + +   trace_dump_call_begin("", "pipe_screen_create"); +   trace_dump_arg_begin("winsys"); +   trace_dump_ptr(screen->winsys); +   trace_dump_arg_end(); +   trace_dump_ret(ptr, screen); +   trace_dump_call_end(); + +   return &tr_scr->base; + +error3: +   FREE(tr_scr); +error2: +   trace_dump_trace_end(); +error1: +   return screen; +} + + +struct trace_screen * +trace_screen(struct pipe_screen *screen) +{ +   assert(screen); +   assert(screen->destroy == trace_screen_destroy); +   return (struct trace_screen *)screen; +} diff --git a/src/gallium/drivers/trace/tr_screen.h b/src/gallium/drivers/trace/tr_screen.h new file mode 100644 index 0000000000..93fefdb9a5 --- /dev/null +++ b/src/gallium/drivers/trace/tr_screen.h @@ -0,0 +1,60 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_SCREEN_H_ +#define TR_SCREEN_H_ + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +    +struct trace_screen +{ +   struct pipe_screen base; +    +   struct pipe_screen *screen; +}; + + +struct trace_screen * +trace_screen(struct pipe_screen *screen); + + +struct pipe_screen * +trace_screen_create(struct pipe_screen *screen); + + +#ifdef __cplusplus +} +#endif + +#endif /* TR_SCREEN_H_ */ diff --git a/src/gallium/drivers/trace/tr_state.c b/src/gallium/drivers/trace/tr_state.c new file mode 100644 index 0000000000..524f2d6194 --- /dev/null +++ b/src/gallium/drivers/trace/tr_state.c @@ -0,0 +1,462 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "pipe/p_compiler.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_dump.h" + +#include "tr_dump.h" +#include "tr_state.h" + + +void trace_dump_format(enum pipe_format format) +{ +   trace_dump_enum(pf_name(format) ); +} + + +void trace_dump_block(const struct pipe_format_block *block) +{ +   trace_dump_struct_begin("pipe_format_block"); +   trace_dump_member(uint, block, size); +   trace_dump_member(uint, block, width); +   trace_dump_member(uint, block, height); +   trace_dump_struct_end(); +} + + +void trace_dump_template(const struct pipe_texture *templat) +{ +   if(!templat) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_texture"); +    +   trace_dump_member(int, templat, target); +   trace_dump_member(format, templat, format); +    +   trace_dump_member_begin("width"); +   trace_dump_array(uint, templat->width, 1); +   trace_dump_member_end(); + +   trace_dump_member_begin("height"); +   trace_dump_array(uint, templat->height, 1); +   trace_dump_member_end(); + +   trace_dump_member_begin("depth"); +   trace_dump_array(uint, templat->depth, 1); +   trace_dump_member_end(); + +   trace_dump_member_begin("block"); +   trace_dump_block(&templat->block); +   trace_dump_member_end(); +    +   trace_dump_member(uint, templat, last_level); +   trace_dump_member(uint, templat, tex_usage); +    +   trace_dump_struct_end(); +} + + +void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_rasterizer_state"); + +   trace_dump_member(bool, state, flatshade); +   trace_dump_member(bool, state, light_twoside); +   trace_dump_member(uint, state, front_winding); +   trace_dump_member(uint, state, cull_mode); +   trace_dump_member(uint, state, fill_cw); +   trace_dump_member(uint, state, fill_ccw); +   trace_dump_member(bool, state, offset_cw); +   trace_dump_member(bool, state, offset_ccw); +   trace_dump_member(bool, state, scissor); +   trace_dump_member(bool, state, poly_smooth); +   trace_dump_member(bool, state, poly_stipple_enable); +   trace_dump_member(bool, state, point_smooth); +   trace_dump_member(bool, state, point_sprite); +   trace_dump_member(bool, state, point_size_per_vertex); +   trace_dump_member(bool, state, multisample); +   trace_dump_member(bool, state, line_smooth); +   trace_dump_member(bool, state, line_stipple_enable); +   trace_dump_member(uint, state, line_stipple_factor); +   trace_dump_member(uint, state, line_stipple_pattern); +   trace_dump_member(bool, state, line_last_pixel); +   trace_dump_member(bool, state, bypass_clipping); +   trace_dump_member(bool, state, bypass_vs); +   trace_dump_member(bool, state, origin_lower_left); +   trace_dump_member(bool, state, flatshade_first); +   trace_dump_member(bool, state, gl_rasterization_rules); + +   trace_dump_member(float, state, line_width); +   trace_dump_member(float, state, point_size); +   trace_dump_member(float, state, point_size_min); +   trace_dump_member(float, state, point_size_max); +   trace_dump_member(float, state, offset_units); +   trace_dump_member(float, state, offset_scale); +    +   trace_dump_member_array(uint, state, sprite_coord_mode); +    +   trace_dump_struct_end(); +} + + +void trace_dump_poly_stipple(const struct pipe_poly_stipple *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   
trace_dump_struct_begin("pipe_poly_stipple"); + +   trace_dump_member_begin("stipple"); +   trace_dump_array(uint, +                    state->stipple,  +                    Elements(state->stipple)); +   trace_dump_member_end(); +    +   trace_dump_struct_end(); +} + + +void trace_dump_viewport_state(const struct pipe_viewport_state *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_viewport_state"); + +   trace_dump_member_array(float, state, scale); +   trace_dump_member_array(float, state, translate); +    +   trace_dump_struct_end(); +} + + +void trace_dump_scissor_state(const struct pipe_scissor_state *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_scissor_state"); + +   trace_dump_member(uint, state, minx); +   trace_dump_member(uint, state, miny); +   trace_dump_member(uint, state, maxx); +   trace_dump_member(uint, state, maxy); + +   trace_dump_struct_end(); +} + + +void trace_dump_clip_state(const struct pipe_clip_state *state) +{ +   unsigned i; +    +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_clip_state"); + +   trace_dump_member_begin("ucp"); +   trace_dump_array_begin(); +   for(i = 0; i < PIPE_MAX_CLIP_PLANES; ++i) { +      trace_dump_elem_begin(); +      trace_dump_array(float, state->ucp[i], 4); +      trace_dump_elem_end(); +   } +   trace_dump_array_end(); +   trace_dump_member_end(); + +   trace_dump_member(uint, state, nr); + +   trace_dump_struct_end(); +} + + +void trace_dump_constant_buffer(const struct pipe_constant_buffer *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_constant_buffer"); + +   trace_dump_member(ptr, state, buffer); + +   trace_dump_struct_end(); +} + + +void trace_dump_shader_state(const struct pipe_shader_state *state) +{ +   static char str[8192]; + +   if(!state) { +      trace_dump_null(); +      return; +   } + +   tgsi_dump_str(state->tokens, 0, str, sizeof(str)); +    +   trace_dump_struct_begin("pipe_shader_state"); + +   trace_dump_member_begin("tokens"); +   trace_dump_string(str); +   trace_dump_member_end(); + +   trace_dump_struct_end(); +} + + +void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state) +{ +   unsigned i; +    +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_depth_stencil_alpha_state"); + +   trace_dump_member_begin("depth"); +   trace_dump_struct_begin("pipe_depth_state"); +   trace_dump_member(bool, &state->depth, enabled); +   trace_dump_member(bool, &state->depth, writemask); +   trace_dump_member(uint, &state->depth, func); +   trace_dump_member(bool, &state->depth, occlusion_count); +   trace_dump_struct_end(); +   trace_dump_member_end(); +    +   trace_dump_member_begin("stencil"); +   trace_dump_array_begin(); +   for(i = 0; i < Elements(state->stencil); ++i) { +      trace_dump_elem_begin(); +      trace_dump_struct_begin("pipe_stencil_state"); +      trace_dump_member(bool, &state->stencil[i], enabled); +      trace_dump_member(uint, &state->stencil[i], func); +      trace_dump_member(uint, &state->stencil[i], fail_op); +      trace_dump_member(uint, &state->stencil[i], zpass_op); +      trace_dump_member(uint, &state->stencil[i], zfail_op); +      trace_dump_member(uint, &state->stencil[i], ref_value); +      trace_dump_member(uint, &state->stencil[i], valuemask); +      
trace_dump_member(uint, &state->stencil[i], writemask); +      trace_dump_struct_end(); +      trace_dump_elem_end(); +   } +   trace_dump_array_end(); +   trace_dump_member_end(); + +   trace_dump_member_begin("alpha"); +   trace_dump_struct_begin("pipe_alpha_state"); +   trace_dump_member(bool, &state->alpha, enabled); +   trace_dump_member(uint, &state->alpha, func); +   trace_dump_member(float, &state->alpha, ref_value); +   trace_dump_struct_end(); +   trace_dump_member_end(); + +   trace_dump_struct_end(); +} + + +void trace_dump_blend_state(const struct pipe_blend_state *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_blend_state"); + +   trace_dump_member(bool, state, blend_enable); + +   trace_dump_member(uint, state, rgb_func); +   trace_dump_member(uint, state, rgb_src_factor); +   trace_dump_member(uint, state, rgb_dst_factor); + +   trace_dump_member(uint, state, alpha_func); +   trace_dump_member(uint, state, alpha_src_factor); +   trace_dump_member(uint, state, alpha_dst_factor); + +   trace_dump_member(bool, state, logicop_enable); +   trace_dump_member(uint, state, logicop_func); + +   trace_dump_member(uint, state, colormask); +   trace_dump_member(bool, state, dither); + +   trace_dump_struct_end(); +} + + +void trace_dump_blend_color(const struct pipe_blend_color *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_blend_color"); + +   trace_dump_member_array(float, state, color); + +   trace_dump_struct_end(); +} + + +void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state) +{ +   trace_dump_struct_begin("pipe_framebuffer_state"); + +   trace_dump_member(uint, state, width); +   trace_dump_member(uint, state, height); +   trace_dump_member(uint, state, nr_cbufs); +   trace_dump_member_array(ptr, state, cbufs); +   trace_dump_member(ptr, state, zsbuf); + +   trace_dump_struct_end(); +} + + +void trace_dump_sampler_state(const struct pipe_sampler_state *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_sampler_state"); + +   trace_dump_member(uint, state, wrap_s); +   trace_dump_member(uint, state, wrap_t); +   trace_dump_member(uint, state, wrap_r); +   trace_dump_member(uint, state, min_img_filter); +   trace_dump_member(uint, state, min_mip_filter); +   trace_dump_member(uint, state, mag_img_filter); +   trace_dump_member(bool, state, compare_mode); +   trace_dump_member(uint, state, compare_func); +   trace_dump_member(bool, state, normalized_coords); +   trace_dump_member(uint, state, prefilter); +   trace_dump_member(float, state, shadow_ambient); +   trace_dump_member(float, state, lod_bias); +   trace_dump_member(float, state, min_lod); +   trace_dump_member(float, state, max_lod); +   trace_dump_member_array(float, state, border_color); +   trace_dump_member(float, state, max_anisotropy); + +   trace_dump_struct_end(); +} + + +void trace_dump_surface(const struct pipe_surface *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_surface"); + +   trace_dump_member(format, state, format); +   trace_dump_member(uint, state, status); +   trace_dump_member(uint, state, clear_value); +   trace_dump_member(uint, state, width); +   trace_dump_member(uint, state, height); + +   trace_dump_member_begin("block"); +   trace_dump_block(&state->block); +   trace_dump_member_end(); +    +   trace_dump_member(uint, state, 
nblocksx); +   trace_dump_member(uint, state, nblocksy); +   trace_dump_member(uint, state, stride); +   trace_dump_member(uint, state, layout); +   trace_dump_member(uint, state, offset); +   trace_dump_member(uint, state, refcount); +   trace_dump_member(uint, state, usage); + +   trace_dump_member(ptr, state, texture); +   trace_dump_member(uint, state, face); +   trace_dump_member(uint, state, level); +   trace_dump_member(uint, state, zslice); + +   trace_dump_struct_end(); +} + + +void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_vertex_buffer"); + +   trace_dump_member(uint, state, stride); +   trace_dump_member(uint, state, max_index); +   trace_dump_member(uint, state, buffer_offset); +   trace_dump_member(ptr, state, buffer); + +   trace_dump_struct_end(); +} + + +void trace_dump_vertex_element(const struct pipe_vertex_element *state) +{ +   if(!state) { +      trace_dump_null(); +      return; +   } + +   trace_dump_struct_begin("pipe_vertex_element"); + +   trace_dump_member(uint, state, src_offset); + +   trace_dump_member(uint, state, vertex_buffer_index); +   trace_dump_member(uint, state, nr_components); +  +   trace_dump_member(format, state, src_format); + +   trace_dump_struct_end(); +} diff --git a/src/gallium/drivers/trace/tr_state.h b/src/gallium/drivers/trace/tr_state.h new file mode 100644 index 0000000000..5ae533dc66 --- /dev/null +++ b/src/gallium/drivers/trace/tr_state.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TR_STATE_H +#define TR_STATE_H + +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" + + +void trace_dump_format(enum pipe_format format); + +void trace_dump_block(const struct pipe_format_block *block); + +void trace_dump_template(const struct pipe_texture *templat); + + +void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state); + +void trace_dump_poly_stipple(const struct pipe_poly_stipple *state); + +void trace_dump_viewport_state(const struct pipe_viewport_state *state); + +void trace_dump_scissor_state(const struct pipe_scissor_state *state); + +void trace_dump_clip_state(const struct pipe_clip_state *state); + +void trace_dump_constant_buffer(const struct pipe_constant_buffer *state); + +void trace_dump_token(const struct tgsi_token *token); + +void trace_dump_shader_state(const struct pipe_shader_state *state); + +void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state); + +void trace_dump_blend_state(const struct pipe_blend_state *state); + +void trace_dump_blend_color(const struct pipe_blend_color *state); + +void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state); + +void trace_dump_sampler_state(const struct pipe_sampler_state *state); + +void trace_dump_surface(const struct pipe_surface *state); + +void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state); + +void trace_dump_vertex_element(const struct pipe_vertex_element *state); + + +#endif /* TR_STATE_H */ diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c new file mode 100644 index 0000000000..1cc4f0bd43 --- /dev/null +++ b/src/gallium/drivers/trace/tr_texture.c @@ -0,0 +1,111 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_inlines.h" +#include "util/u_hash_table.h" +#include "util/u_memory.h" + +#include "tr_screen.h" +#include "tr_texture.h" + + +struct pipe_texture * +trace_texture_create(struct trace_screen *tr_scr,  +                     struct pipe_texture *texture) +{ +   struct trace_texture *tr_tex; +    +   if(!texture) +      goto error; +    +   assert(texture->screen == tr_scr->screen); +    +   tr_tex = CALLOC_STRUCT(trace_texture); +   if(!tr_tex) +      goto error; +    +   memcpy(&tr_tex->base, texture, sizeof(struct pipe_texture)); +   tr_tex->base.screen = &tr_scr->base; +   tr_tex->texture = texture; +    +   return &tr_tex->base; +    +error: +   pipe_texture_reference(&texture, NULL); +   return NULL; +} + + +void +trace_texture_destroy(struct trace_screen *tr_scr,  +                      struct pipe_texture *texture) +{ +   struct trace_texture *tr_tex = trace_texture(tr_scr, texture);  +   pipe_texture_reference(&tr_tex->texture, NULL); +   FREE(tr_tex); +} + + +struct pipe_surface * +trace_surface_create(struct trace_texture *tr_tex,  +                     struct pipe_surface *surface) +{ +   struct trace_surface *tr_surf; +    +   if(!surface) +      goto error; +    +   assert(surface->texture == tr_tex->texture); +    +   tr_surf = CALLOC_STRUCT(trace_surface); +   if(!tr_surf) +      goto error; +    +   memcpy(&tr_surf->base, surface, sizeof(struct pipe_surface)); +    +   tr_surf->base.texture = NULL; +   pipe_texture_reference(&tr_surf->base.texture, &tr_tex->base); +   tr_surf->surface = surface; + +   return &tr_surf->base; +    +error: +   pipe_surface_reference(&surface, NULL); +   return NULL; +} + + +void +trace_surface_destroy(struct trace_texture *tr_tex,  +                      struct pipe_surface *surface) +{ +   struct trace_surface *tr_surf = trace_surface(tr_tex, surface); +   pipe_texture_reference(&tr_surf->base.texture, NULL); +   pipe_surface_reference(&tr_surf->surface, NULL); +   FREE(tr_surf); +} + diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h new file mode 100644 index 0000000000..9e72edb8a3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_texture.h @@ -0,0 +1,95 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_TEXTURE_H_ +#define TR_TEXTURE_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" + +#include "tr_screen.h" + + +struct trace_texture +{ +   struct pipe_texture base; + +   struct pipe_texture *texture; +}; + + +struct trace_surface +{ +   struct pipe_surface base; + +   struct pipe_surface *surface; +    +   void *map; +}; + + +static INLINE struct trace_texture * +trace_texture(struct trace_screen *tr_scr,  +              struct pipe_texture *texture) +{ +   if(!texture) +      return NULL; +   assert(texture->screen == &tr_scr->base); +   return (struct trace_texture *)texture; +} + + +static INLINE struct trace_surface * +trace_surface(struct trace_texture *tr_tex,  +              struct pipe_surface *surface) +{ +   if(!surface) +      return NULL; +   assert(surface->texture == &tr_tex->base); +   return (struct trace_surface *)surface; +} + + +struct pipe_texture * +trace_texture_create(struct trace_screen *tr_scr,  +                     struct pipe_texture *texture); + +void +trace_texture_destroy(struct trace_screen *tr_scr,  +                      struct pipe_texture *texture); + +struct pipe_surface * +trace_surface_create(struct trace_texture *tr_tex,  +                     struct pipe_surface *surface); + +void +trace_surface_destroy(struct trace_texture *tr_tex, +                      struct pipe_surface *surface); + + +#endif /* TR_TEXTURE_H_ */ diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c new file mode 100644 index 0000000000..c4148fe810 --- /dev/null +++ b/src/gallium/drivers/trace/tr_winsys.c @@ -0,0 +1,450 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_memory.h" +#include "util/u_hash_table.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_screen.h" +#include "tr_texture.h" +#include "tr_winsys.h" + + +static unsigned trace_buffer_hash(void *buffer) +{ +   return (unsigned)(uintptr_t)buffer; +} + + +static int trace_buffer_compare(void *buffer1, void *buffer2) +{ +   return (char *)buffer2 - (char *)buffer1; +} + +                   +static const char * +trace_winsys_get_name(struct pipe_winsys *_winsys) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   const char *result; +    +   trace_dump_call_begin("pipe_winsys", "get_name"); +    +   trace_dump_arg(ptr, winsys); + +   result = winsys->get_name(winsys); +    +   trace_dump_ret(string, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static void  +trace_winsys_flush_frontbuffer(struct pipe_winsys *_winsys, +                               struct pipe_surface *surface, +                               void *context_private) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; + +   assert(surface); +   if(surface->texture) { +      struct trace_screen *tr_scr = trace_screen(surface->texture->screen); +      struct trace_texture *tr_tex = trace_texture(tr_scr, surface->texture); +      struct trace_surface *tr_surf = trace_surface(tr_tex, surface); +      surface = tr_surf->surface; +   } +    +   trace_dump_call_begin("pipe_winsys", "flush_frontbuffer"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(ptr, surface); +   /* XXX: hide, as there is nothing we can do with this +   trace_dump_arg(ptr, context_private); +   */ + +   winsys->flush_frontbuffer(winsys, surface, context_private); +    +   trace_dump_call_end(); +} + + +static struct pipe_buffer * +trace_winsys_surface_buffer_create(struct pipe_winsys *_winsys, +                                   unsigned width, unsigned height, +                                   enum pipe_format format, +                                   unsigned usage, +                                   unsigned *pstride) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   unsigned stride; +   struct pipe_buffer *result; +    +   trace_dump_call_begin("pipe_winsys", "surface_buffer_create"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(uint, width); +   trace_dump_arg(uint, height); +   trace_dump_arg(format, format); +   trace_dump_arg(uint, usage); + +   result = winsys->surface_buffer_create(winsys, +                                          width, height, +                                          format, +                                          usage, +                                          pstride); +    +   stride = *pstride; +    +   trace_dump_arg(uint, stride); +    +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static struct pipe_buffer * +trace_winsys_buffer_create(struct pipe_winsys *_winsys,  +                           unsigned alignment,  +                           unsigned usage, +                           unsigned size) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   struct pipe_buffer *buffer; +    +   trace_dump_call_begin("pipe_winsys", "buffer_create"); +    +   trace_dump_arg(ptr, winsys); 
+   trace_dump_arg(uint, alignment); +   trace_dump_arg(uint, usage); +   trace_dump_arg(uint, size); + +   buffer = winsys->buffer_create(winsys, alignment, usage, size); +    +   trace_dump_ret(ptr, buffer); +    +   trace_dump_call_end(); + +   /* Zero the buffer to avoid dumping uninitialized memory */ +   if(buffer->usage & PIPE_BUFFER_USAGE_CPU_WRITE) { +      void *map; +      map = winsys->buffer_map(winsys, buffer, PIPE_BUFFER_USAGE_CPU_WRITE); +      if(map) { +         memset(map, 0, buffer->size); +         winsys->buffer_unmap(winsys, buffer); +      } +   } +    +   return buffer; +} + + +static struct pipe_buffer * +trace_winsys_user_buffer_create(struct pipe_winsys *_winsys,  +                                void *data, +                                unsigned size) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   struct pipe_buffer *result; +    +   trace_dump_call_begin("pipe_winsys", "user_buffer_create"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg_begin("data"); +   trace_dump_bytes(data, size); +   trace_dump_arg_end(); +   trace_dump_arg(uint, size); + +   result = winsys->user_buffer_create(winsys, data, size); +    +   trace_dump_ret(ptr, result); +    +   trace_dump_call_end(); +    +   /* XXX: Mark the user buffers. (we should wrap pipe_buffers, but it is  +    * impossible to do so while texture-less surfaces are still around) */ +   if(result) { +      assert(!(result->usage & TRACE_BUFFER_USAGE_USER)); +      result->usage |= TRACE_BUFFER_USAGE_USER; +   } +    +   return result; +} + + +void +trace_winsys_user_buffer_update(struct pipe_winsys *_winsys,  +                                struct pipe_buffer *buffer) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   const void *map; +    +   if(buffer && buffer->usage & TRACE_BUFFER_USAGE_USER) { +      map = winsys->buffer_map(winsys, buffer, PIPE_BUFFER_USAGE_CPU_READ); +      if(map) { +         trace_dump_call_begin("pipe_winsys", "buffer_write"); +          +         trace_dump_arg(ptr, winsys); +          +         trace_dump_arg(ptr, buffer); +          +         trace_dump_arg_begin("data"); +         trace_dump_bytes(map, buffer->size); +         trace_dump_arg_end(); +       +         trace_dump_arg_begin("size"); +         trace_dump_uint(buffer->size); +         trace_dump_arg_end(); +       +         trace_dump_call_end(); +          +         winsys->buffer_unmap(winsys, buffer); +      } +   } +} + + +static void * +trace_winsys_buffer_map(struct pipe_winsys *_winsys,  +                        struct pipe_buffer *buffer, +                        unsigned usage) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   void *map; +    +   map = winsys->buffer_map(winsys, buffer, usage); +   if(map) { +      if(usage & PIPE_BUFFER_USAGE_CPU_WRITE) { +         assert(!hash_table_get(tr_ws->buffer_maps, buffer)); +         hash_table_set(tr_ws->buffer_maps, buffer, map); +      } +   } +    +   return map; +} + + +static void +trace_winsys_buffer_unmap(struct pipe_winsys *_winsys,  +                          struct pipe_buffer *buffer) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   const void *map; +    +   map = hash_table_get(tr_ws->buffer_maps, buffer); +   if(map) { +      trace_dump_call_begin("pipe_winsys", "buffer_write"); +       +      
trace_dump_arg(ptr, winsys); +       +      trace_dump_arg(ptr, buffer); +       +      trace_dump_arg_begin("data"); +      trace_dump_bytes(map, buffer->size); +      trace_dump_arg_end(); + +      trace_dump_arg_begin("size"); +      trace_dump_uint(buffer->size); +      trace_dump_arg_end(); +    +      trace_dump_call_end(); + +      hash_table_remove(tr_ws->buffer_maps, buffer); +   } +    +   winsys->buffer_unmap(winsys, buffer); +} + + +static void +trace_winsys_buffer_destroy(struct pipe_winsys *_winsys, +                            struct pipe_buffer *buffer) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +    +   trace_dump_call_begin("pipe_winsys", "buffer_destroy"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(ptr, buffer); + +   winsys->buffer_destroy(winsys, buffer); +    +   trace_dump_call_end(); +} + + +static void +trace_winsys_fence_reference(struct pipe_winsys *_winsys, +                             struct pipe_fence_handle **pdst, +                             struct pipe_fence_handle *src) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   struct pipe_fence_handle *dst = *pdst; +    +   trace_dump_call_begin("pipe_winsys", "fence_reference"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(ptr, dst); +   trace_dump_arg(ptr, src); + +   winsys->fence_reference(winsys, pdst, src); +    +   trace_dump_call_end(); +} + + +static int +trace_winsys_fence_signalled(struct pipe_winsys *_winsys, +                             struct pipe_fence_handle *fence, +                             unsigned flag) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   int result; +    +   trace_dump_call_begin("pipe_winsys", "fence_signalled"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(ptr, fence); +   trace_dump_arg(uint, flag); + +   result = winsys->fence_signalled(winsys, fence, flag); +    +   trace_dump_ret(int, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static int +trace_winsys_fence_finish(struct pipe_winsys *_winsys, +                          struct pipe_fence_handle *fence, +                          unsigned flag) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +   int result; +    +   trace_dump_call_begin("pipe_winsys", "fence_finish"); +    +   trace_dump_arg(ptr, winsys); +   trace_dump_arg(ptr, fence); +   trace_dump_arg(uint, flag); + +   result = winsys->fence_finish(winsys, fence, flag); +    +   trace_dump_ret(int, result); +    +   trace_dump_call_end(); +    +   return result; +} + + +static void +trace_winsys_destroy(struct pipe_winsys *_winsys) +{ +   struct trace_winsys *tr_ws = trace_winsys(_winsys); +   struct pipe_winsys *winsys = tr_ws->winsys; +    +   trace_dump_call_begin("pipe_winsys", "destroy"); +    +   trace_dump_arg(ptr, winsys); + +   /*  +   winsys->destroy(winsys);  +   */ +    +   trace_dump_call_end(); +    +   hash_table_destroy(tr_ws->buffer_maps); + +   FREE(tr_ws); +} + + +struct pipe_winsys * +trace_winsys_create(struct pipe_winsys *winsys) +{ +   struct trace_winsys *tr_ws; +    +   if(!winsys) +      goto error1; +    +   tr_ws = CALLOC_STRUCT(trace_winsys); +   if(!tr_ws) +      goto error1; + +   tr_ws->base.destroy = trace_winsys_destroy; +   tr_ws->base.get_name = trace_winsys_get_name; +   tr_ws->base.flush_frontbuffer = 
trace_winsys_flush_frontbuffer; +   tr_ws->base.surface_buffer_create = trace_winsys_surface_buffer_create; +   tr_ws->base.buffer_create = trace_winsys_buffer_create; +   tr_ws->base.user_buffer_create = trace_winsys_user_buffer_create; +   tr_ws->base.buffer_map = trace_winsys_buffer_map; +   tr_ws->base.buffer_unmap = trace_winsys_buffer_unmap; +   tr_ws->base.buffer_destroy = trace_winsys_buffer_destroy; +   tr_ws->base.fence_reference = trace_winsys_fence_reference; +   tr_ws->base.fence_signalled = trace_winsys_fence_signalled; +   tr_ws->base.fence_finish = trace_winsys_fence_finish; +    +   tr_ws->winsys = winsys; + +   tr_ws->buffer_maps = hash_table_create(trace_buffer_hash,  +                                          trace_buffer_compare); +   if(!tr_ws->buffer_maps) +      goto error2; +    +   trace_dump_call_begin("", "pipe_winsys_create"); +   trace_dump_ret(ptr, winsys); +   trace_dump_call_end(); + +   return &tr_ws->base; +    +error2: +   FREE(tr_ws); +error1: +   return winsys; +} diff --git a/src/gallium/drivers/trace/tr_winsys.h b/src/gallium/drivers/trace/tr_winsys.h new file mode 100644 index 0000000000..0fd2a40556 --- /dev/null +++ b/src/gallium/drivers/trace/tr_winsys.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_WINSYS_H_ +#define TR_WINSYS_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/internal/p_winsys_screen.h" + + +/** + * It often happens that new data is written directly to the user buffers  + * without mapping/unmapping. This flag marks user buffers, so that their  + * contents can be dumped before being used by the pipe context. 
+ */ +#define TRACE_BUFFER_USAGE_USER  (1 << 31) + + +struct hash_table; + + +struct trace_winsys +{ +   struct pipe_winsys base; +    +   struct pipe_winsys *winsys; +    +   struct hash_table *buffer_maps; +}; + + +static INLINE struct trace_winsys * +trace_winsys(struct pipe_winsys *winsys) +{ +   assert(winsys); +   return (struct trace_winsys *)winsys; +} + + + +struct pipe_winsys * +trace_winsys_create(struct pipe_winsys *winsys); + + +void +trace_winsys_user_buffer_update(struct pipe_winsys *winsys,  +                                struct pipe_buffer *buffer); + + +#endif /* TR_WINSYS_H_ */ diff --git a/src/gallium/drivers/trace/trace.xsl b/src/gallium/drivers/trace/trace.xsl new file mode 100644 index 0000000000..9cd621e7ab --- /dev/null +++ b/src/gallium/drivers/trace/trace.xsl @@ -0,0 +1,185 @@ +<?xml version="1.0"?> + +<!-- + +Copyright 2008 Tungsten Graphics, Inc. + +This program is free software: you can redistribute it and/or modify it +under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program.  If not, see <http://www.gnu.org/licenses/>. + +!--> + +<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> + +	<xsl:output method="html" /> + +	<xsl:strip-space elements="*" /> + +	<xsl:template match="/trace"> +		<html> +			<head> +				<title>Gallium Trace</title> +			</head> +			<style> +				body { +					font-family: verdana, sans-serif; +					font-size: 11px; +					font-weight: normal; +					text-align : left; +				} + +				.fun { +					font-weight: bold; +				} + +				.var { +					font-style: italic; +				} + +				.typ { +					display: none; +				} + +				.lit { +					color: #0000ff; +				} + +				.ptr { +					color: #008000; +				} +			</style> +			<body> +				<ol class="calls"> +					<xsl:apply-templates/> +				</ol> +			</body> +		</html> +	</xsl:template> + +	<xsl:template match="call"> +		<li> +			<span class="fun"> +				<xsl:value-of select="@class"/> +				<xsl:text>::</xsl:text> +				<xsl:value-of select="@method"/> +			</span> +			<xsl:text>(</xsl:text> +			<xsl:apply-templates select="arg"/> +			<xsl:text>)</xsl:text> +			<xsl:apply-templates select="ret"/> +		</li> +	</xsl:template> + +	<xsl:template match="arg|member"> +			<xsl:apply-templates select="@name"/> +			<xsl:text> = </xsl:text> +			<xsl:apply-templates /> +			<xsl:if test="position() != last()"> +				<xsl:text>, </xsl:text> +			</xsl:if> +	</xsl:template> + +	<xsl:template match="ret"> +		<xsl:text> = </xsl:text> +		<xsl:apply-templates /> +	</xsl:template> + +	<xsl:template match="bool|int|uint|float|enum"> +		<span class="lit"> +			<xsl:value-of select="text()"/> +		</span> +	</xsl:template> + +	<xsl:template match="bytes"> +		<span class="lit"> +			<xsl:text>...</xsl:text> +		</span> +	</xsl:template> + +	<xsl:template match="string"> +		<span class="lit"> +			<xsl:text>"</xsl:text> +			<xsl:call-template name="break"> +				<xsl:with-param name="text" select="text()"/> +			</xsl:call-template> +			<xsl:text>"</xsl:text> +		</span> +	</xsl:template> + +	<xsl:template match="array|struct"> +		<xsl:text>{</xsl:text> +		
<xsl:apply-templates /> +		<xsl:text>}</xsl:text> +	</xsl:template> + +	<xsl:template match="elem"> +		<xsl:apply-templates /> +		<xsl:if test="position() != last()"> +			<xsl:text>, </xsl:text> +		</xsl:if> +	</xsl:template> + +	<xsl:template match="null"> +		<span class="ptr"> +			<xsl:text>NULL</xsl:text> +		</span> +	</xsl:template> + +	<xsl:template match="ptr"> +		<span class="ptr"> +			<xsl:value-of select="text()"/> +		</span> +	</xsl:template> + +	<xsl:template match="@name"> +		<span class="var"> +			<xsl:value-of select="."/> +		</span> +	</xsl:template> +	 +	<xsl:template name="break"> +		<xsl:param name="text" select="."/> +		<xsl:choose> +			<xsl:when test="contains($text, '
')"> +				<xsl:value-of select="substring-before($text, '
')"/> +				<br/> +				<xsl:call-template name="break"> +					 <xsl:with-param name="text" select="substring-after($text, '
')"/> +				</xsl:call-template> +			</xsl:when> +			<xsl:otherwise> +				<xsl:value-of select="$text"/> +			</xsl:otherwise> +		</xsl:choose> +	</xsl:template> + +	<xsl:template name="replace"> +		<xsl:param name="text"/> +		<xsl:param name="from"/> +		<xsl:param name="to"/> +		<xsl:choose> +			<xsl:when test="contains($text,$from)"> +				<xsl:value-of select="concat(substring-before($text,$from),$to)"/> +				<xsl:call-template name="replace"> +					<xsl:with-param name="text" select="substring-after($text,$from)"/> +					<xsl:with-param name="from" select="$from"/> +					<xsl:with-param name="to" select="$to"/> +				</xsl:call-template> +			</xsl:when> +			<xsl:otherwise> +				<xsl:value-of select="$text"/> +			</xsl:otherwise> +		</xsl:choose> +	</xsl:template> + +</xsl:transform>  | 

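The trace sources above all follow the same wrap-and-delegate pattern: each trace_* object exposes a "base" struct of function pointers that callers see, keeps a pointer to the real object it wraps, logs the call and its arguments with the trace_dump_* helpers, and then forwards the call to the wrapped implementation. The stand-alone C sketch below illustrates that pattern outside of Gallium; the "toy_screen" interface and every name in it are invented for this example only and are not part of the pipe driver API.

/*
 * Illustrative sketch only: a self-contained program showing the
 * wrap-and-delegate pattern used by the trace driver.  All types and
 * names here (toy_screen, trace_toy_screen, ...) are hypothetical.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* A tiny "driver" interface, loosely analogous to pipe_screen. */
struct toy_screen {
   const char *(*get_name)(struct toy_screen *screen);
   void (*destroy)(struct toy_screen *screen);
};

/* The wrapper: the base struct callers see, plus the wrapped screen. */
struct trace_toy_screen {
   struct toy_screen base;      /* what callers see */
   struct toy_screen *screen;   /* the real, wrapped screen */
};

static const char *
trace_toy_get_name(struct toy_screen *_screen)
{
   struct trace_toy_screen *tr = (struct trace_toy_screen *)_screen;
   const char *result;

   printf("<call class='toy_screen' method='get_name'>\n");
   result = tr->screen->get_name(tr->screen);   /* forward to the real screen */
   printf("  <ret><string>%s</string></ret>\n</call>\n", result);
   return result;
}

static void
trace_toy_destroy(struct toy_screen *_screen)
{
   struct trace_toy_screen *tr = (struct trace_toy_screen *)_screen;

   printf("<call class='toy_screen' method='destroy'/>\n");
   tr->screen->destroy(tr->screen);             /* forward, then free the wrapper */
   free(tr);
}

/* Wrap on success; on allocation failure return the unwrapped screen so
 * the caller still gets something usable (tracing degrades gracefully). */
static struct toy_screen *
trace_toy_screen_create(struct toy_screen *screen)
{
   struct trace_toy_screen *tr;

   if (!screen)
      return NULL;

   tr = calloc(1, sizeof *tr);
   if (!tr)
      return screen;

   tr->base.get_name = trace_toy_get_name;
   tr->base.destroy = trace_toy_destroy;
   tr->screen = screen;
   return &tr->base;
}

/* A trivial "real" implementation so the example runs. */
static const char *real_get_name(struct toy_screen *s) { (void)s; return "toy"; }
static void real_destroy(struct toy_screen *s) { (void)s; }

int main(void)
{
   struct toy_screen real = { real_get_name, real_destroy };
   struct toy_screen *s = trace_toy_screen_create(&real);

   assert(s);
   s->get_name(s);   /* logged, then forwarded to the real screen */
   s->destroy(s);    /* logged, forwarded, wrapper freed */
   return 0;
}

The same shape appears in trace_screen_create() and trace_winsys_create() above, including the fall-back to the unwrapped object on allocation failure. The XML-like records emitted by the real driver's trace_dump_* calls are what trace.xsl renders as HTML, for example by running the dump through an XSLT processor such as xsltproc.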